From 2581c3ab60b54dbb4267c2a09b93f2f0b6afa7af Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Sun, 23 Feb 2025 14:26:01 +0100 Subject: [PATCH] ethos: Initial commit of a driver for the Arm Ethos-U65 NPU. Supports all models in the test suite. No optimizations implemented yet. Acked-by: Christian Gmeiner Part-of: --- .clang-format-include | 1 + include/drm-uapi/ethosu_accel.h | 262 ++++ meson.build | 3 +- meson.options | 2 +- src/gallium/drivers/ethosu/.clang-format | 2 + .../drivers/ethosu/ci/ethos-imx93-fails.txt | 0 .../drivers/ethosu/ci/ethos-imx93-flakes.txt | 0 .../drivers/ethosu/ci/ethos-imx93-skips.txt | 14 + src/gallium/drivers/ethosu/decode.py | 75 ++ src/gallium/drivers/ethosu/ethosu_cmd.c | 783 +++++++++++ src/gallium/drivers/ethosu/ethosu_cmd.h | 13 + src/gallium/drivers/ethosu/ethosu_coefs.c | 133 ++ src/gallium/drivers/ethosu/ethosu_coefs.h | 17 + src/gallium/drivers/ethosu/ethosu_device.c | 243 ++++ src/gallium/drivers/ethosu/ethosu_device.h | 84 ++ src/gallium/drivers/ethosu/ethosu_lower.c | 477 +++++++ src/gallium/drivers/ethosu/ethosu_lower.h | 15 + src/gallium/drivers/ethosu/ethosu_ml.c | 363 +++++ src/gallium/drivers/ethosu/ethosu_ml.h | 229 ++++ src/gallium/drivers/ethosu/ethosu_sched.c | 193 +++ src/gallium/drivers/ethosu/ethosu_sched.h | 13 + src/gallium/drivers/ethosu/gen_header.py | 125 ++ src/gallium/drivers/ethosu/gen_parser.py | 745 +++++++++++ src/gallium/drivers/ethosu/meson.build | 33 + .../drivers/ethosu/mlw_codec/mlw_common.h | 29 + .../drivers/ethosu/mlw_codec/mlw_encode.c | 1186 +++++++++++++++++ .../drivers/ethosu/mlw_codec/mlw_encode.h | 65 + src/gallium/drivers/ethosu/registers.xml | 399 ++++++ src/gallium/drivers/ethosu/rules-ng.xsd | 457 +++++++ src/gallium/meson.build | 6 + src/gallium/targets/dri/meson.build | 2 +- .../winsys/ethosu/drm/ethosu_drm_public.h | 17 + .../winsys/ethosu/drm/ethosu_drm_winsys.c | 19 + src/gallium/winsys/ethosu/drm/meson.build | 13 + 34 files changed, 6015 insertions(+), 3 deletions(-) create mode 100644 include/drm-uapi/ethosu_accel.h create mode 100644 src/gallium/drivers/ethosu/.clang-format create mode 100644 src/gallium/drivers/ethosu/ci/ethos-imx93-fails.txt create mode 100644 src/gallium/drivers/ethosu/ci/ethos-imx93-flakes.txt create mode 100644 src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt create mode 100644 src/gallium/drivers/ethosu/decode.py create mode 100644 src/gallium/drivers/ethosu/ethosu_cmd.c create mode 100644 src/gallium/drivers/ethosu/ethosu_cmd.h create mode 100644 src/gallium/drivers/ethosu/ethosu_coefs.c create mode 100644 src/gallium/drivers/ethosu/ethosu_coefs.h create mode 100644 src/gallium/drivers/ethosu/ethosu_device.c create mode 100644 src/gallium/drivers/ethosu/ethosu_device.h create mode 100644 src/gallium/drivers/ethosu/ethosu_lower.c create mode 100644 src/gallium/drivers/ethosu/ethosu_lower.h create mode 100644 src/gallium/drivers/ethosu/ethosu_ml.c create mode 100644 src/gallium/drivers/ethosu/ethosu_ml.h create mode 100644 src/gallium/drivers/ethosu/ethosu_sched.c create mode 100644 src/gallium/drivers/ethosu/ethosu_sched.h create mode 100644 src/gallium/drivers/ethosu/gen_header.py create mode 100644 src/gallium/drivers/ethosu/gen_parser.py create mode 100644 src/gallium/drivers/ethosu/meson.build create mode 100644 src/gallium/drivers/ethosu/mlw_codec/mlw_common.h create mode 100644 src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c create mode 100644 src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h create mode 100644 src/gallium/drivers/ethosu/registers.xml create mode 
100644 src/gallium/drivers/ethosu/rules-ng.xsd
 create mode 100644 src/gallium/winsys/ethosu/drm/ethosu_drm_public.h
 create mode 100644 src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c
 create mode 100644 src/gallium/winsys/ethosu/drm/meson.build

diff --git a/.clang-format-include b/.clang-format-include
index d7c5747177f..ba52553fdc9 100644
--- a/.clang-format-include
+++ b/.clang-format-include
@@ -1,6 +1,7 @@
 # The following files are opted into `ninja clang-format` and
 # enforcement in the CI.
+src/gallium/drivers/ethosu/**/*
 src/gallium/drivers/i915
 src/gallium/drivers/r300/compiler/*
 src/gallium/drivers/rocket/**/*
diff --git a/include/drm-uapi/ethosu_accel.h b/include/drm-uapi/ethosu_accel.h
new file mode 100644
index 00000000000..135d6480e3a
--- /dev/null
+++ b/include/drm-uapi/ethosu_accel.h
@@ -0,0 +1,262 @@
+/* SPDX-License-Identifier: MIT */
+/* Copyright (C) 2025 Arm, Ltd. */
+#ifndef _ETHOSU_DRM_H_
+#define _ETHOSU_DRM_H_
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * DOC: IOCTL IDs
+ *
+ * enum drm_ethosu_ioctl_id - IOCTL IDs
+ *
+ * Place new ioctls at the end, don't re-order, don't replace or remove entries.
+ *
+ * These IDs are not meant to be used directly. Use the DRM_IOCTL_ETHOSU_xxx
+ * definitions instead.
+ */
+enum drm_ethosu_ioctl_id {
+    /** @DRM_ETHOSU_DEV_QUERY: Query device information. */
+    DRM_ETHOSU_DEV_QUERY = 0,
+
+    /** @DRM_ETHOSU_BO_CREATE: Create a buffer object. */
+    DRM_ETHOSU_BO_CREATE,
+
+    /** @DRM_ETHOSU_BO_WAIT: Wait on a buffer object's fence. */
+    DRM_ETHOSU_BO_WAIT,
+
+    /**
+     * @DRM_ETHOSU_BO_MMAP_OFFSET: Get the file offset to pass to
+     * mmap to map a GEM object.
+     */
+    DRM_ETHOSU_BO_MMAP_OFFSET,
+
+    /**
+     * @DRM_ETHOSU_CMDSTREAM_BO_CREATE: Create a command stream buffer
+     * object.
+     */
+    DRM_ETHOSU_CMDSTREAM_BO_CREATE,
+
+    /** @DRM_ETHOSU_SUBMIT: Submit a job and BOs to run. */
+    DRM_ETHOSU_SUBMIT,
+};
+
+/**
+ * DOC: IOCTL arguments
+ */
+
+/**
+ * enum drm_ethosu_dev_query_type - Query type
+ *
+ * Place new types at the end, don't re-order, don't remove or replace.
+ */
+enum drm_ethosu_dev_query_type {
+    /** @DRM_ETHOSU_DEV_QUERY_NPU_INFO: Query NPU information. */
+    DRM_ETHOSU_DEV_QUERY_NPU_INFO = 0,
+};
+
+/**
+ * struct drm_ethosu_npu_info - NPU information
+ *
+ * Structure grouping all queryable information relating to the NPU.
+ */
+struct drm_ethosu_npu_info {
+    /** @id: NPU ID. */
+    __u32 id;
+#define DRM_ETHOSU_ARCH_MAJOR(x) ((x) >> 28)
+#define DRM_ETHOSU_ARCH_MINOR(x) (((x) >> 20) & 0xff)
+#define DRM_ETHOSU_ARCH_PATCH(x) (((x) >> 16) & 0xf)
+#define DRM_ETHOSU_PRODUCT_MAJOR(x) (((x) >> 12) & 0xf)
+#define DRM_ETHOSU_VERSION_MAJOR(x) (((x) >> 8) & 0xf)
+#define DRM_ETHOSU_VERSION_MINOR(x) (((x) >> 4) & 0xff)
+#define DRM_ETHOSU_VERSION_STATUS(x) ((x) & 0xf)
+
+    /** @config: NPU configuration. */
+    __u32 config;
+
+    /** @sram_size: Size in bytes of the SRAM attached to the NPU. */
+    __u32 sram_size;
+};
+
+/**
+ * struct drm_ethosu_dev_query - Arguments passed to DRM_IOCTL_ETHOSU_DEV_QUERY
+ */
+struct drm_ethosu_dev_query {
+    /** @type: the query type (see drm_ethosu_dev_query_type). */
+    __u32 type;
+
+    /**
+     * @size: size of the type being queried.
+     *
+     * If pointer is NULL, size is updated by the driver to provide the
+     * output structure size. If pointer is not NULL, the driver will
+     * only copy min(size, actual_structure_size) bytes to the pointer,
+     * and update the size accordingly. This allows us to extend query
+     * types without breaking userspace.
+     */
+    __u32 size;
+
+    /**
+     * @pointer: user pointer to a query type struct.
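+     * For DRM_ETHOSU_DEV_QUERY_NPU_INFO this should point to a
+     * struct drm_ethosu_npu_info.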
+     *
+     * Pointer can be NULL, in which case, nothing is copied, but the
+     * actual structure size is returned. If not NULL, it must point to
+     * a location that's large enough to hold size bytes.
+     */
+    __u64 pointer;
+};
+
+/**
+ * enum drm_ethosu_bo_flags - Buffer object flags, passed at creation time.
+ */
+enum drm_ethosu_bo_flags {
+    /**
+     * @DRM_ETHOSU_BO_NO_MMAP: The buffer object will never be CPU-mapped
+     * in userspace.
+     */
+    DRM_ETHOSU_BO_NO_MMAP = (1 << 0),
+};
+
+/**
+ * struct drm_ethosu_bo_create - Arguments passed to DRM_IOCTL_ETHOSU_BO_CREATE.
+ */
+struct drm_ethosu_bo_create {
+    /**
+     * @size: Requested size for the object
+     *
+     * The (page-aligned) allocated size for the object will be returned.
+     */
+    __u64 size;
+
+    /**
+     * @flags: Flags. Must be a combination of drm_ethosu_bo_flags flags.
+     */
+    __u32 flags;
+
+    /**
+     * @handle: Returned handle for the object.
+     *
+     * Object handles are nonzero.
+     */
+    __u32 handle;
+};
+
+/**
+ * struct drm_ethosu_bo_mmap_offset - Arguments passed to DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET.
+ */
+struct drm_ethosu_bo_mmap_offset {
+    /** @handle: Handle of the object we want an mmap offset for. */
+    __u32 handle;
+
+    /** @pad: MBZ. */
+    __u32 pad;
+
+    /** @offset: The fake offset to use for subsequent mmap calls. */
+    __u64 offset;
+};
+
+/**
+ * struct drm_ethosu_bo_wait - ioctl argument for waiting for
+ * completion of the last DRM_ETHOSU_SUBMIT on a BO.
+ *
+ * This is useful for cases where multiple processes might be
+ * submitting jobs against a BO and you want to wait for all of them
+ * to complete.
+ */
+struct drm_ethosu_bo_wait {
+    __u32 handle;
+    __u32 pad;
+    __s64 timeout_ns; /* absolute */
+};
+
+/**
+ * struct drm_ethosu_cmdstream_bo_create - Arguments passed to
+ * DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE.
+ */
+struct drm_ethosu_cmdstream_bo_create {
+    /* Size of the data argument. */
+    __u32 size;
+
+    /* Flags, currently must be 0. */
+    __u32 flags;
+
+    /* Pointer to the data. */
+    __u64 data;
+
+    /* Returned GEM handle for the BO. */
+    __u32 handle;
+
+    /* Pad, must be 0. */
+    __u32 pad;
+};
+
+/**
+ * struct drm_ethosu_job - A job to be run on the NPU
+ *
+ * The kernel will schedule the execution of this job taking into account its
+ * dependencies with other jobs. All tasks in the same job will be executed
+ * sequentially on the same core, to benefit from memory residency in SRAM.
+ */
+struct drm_ethosu_job {
+    /** Input: BO handle for cmdstream. */
+    __u32 cmd_bo;
+
+    /** Input: Amount of SRAM to use. */
+    __u32 sram_size;
+
+#define ETHOSU_MAX_REGIONS 8
+    /** Input: Array of BO handles for each region. */
+    __u32 region_bo_handles[ETHOSU_MAX_REGIONS];
+};
+
+/**
+ * struct drm_ethosu_submit - ioctl argument for submitting commands to the NPU.
+ *
+ * The kernel will schedule the execution of these jobs in dependency order.
+ */
+struct drm_ethosu_submit {
+    /** Input: Pointer to an array of struct drm_ethosu_job. */
+    __u64 jobs;
+
+    /** Input: Number of jobs passed in. */
+    __u32 job_count;
+
+    /** Reserved, must be zero. */
+    __u32 pad;
+};
+
+/**
+ * DRM_IOCTL_ETHOSU() - Build an ethosu IOCTL number
+ * @__access: Access type. Must be R, W or RW.
+ * @__id: One of the DRM_ETHOSU_xxx ids.
+ * @__type: Suffix of the type being passed to the IOCTL.
+ *
+ * Don't use this macro directly, use the DRM_IOCTL_ETHOSU_xxx
+ * values instead.
+ *
+ * Return: An IOCTL number to be passed to ioctl() from userspace.
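+ *
+ * For example, DRM_IOCTL_ETHOSU(WR, DEV_QUERY, dev_query) builds the
+ * read/write IOCTL number for struct drm_ethosu_dev_query.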
+ */ +#define DRM_IOCTL_ETHOSU(__access, __id, __type) \ + DRM_IO ## __access(DRM_COMMAND_BASE + DRM_ETHOSU_ ## __id, \ + struct drm_ethosu_ ## __type) + +enum { + DRM_IOCTL_ETHOSU_DEV_QUERY = + DRM_IOCTL_ETHOSU(WR, DEV_QUERY, dev_query), + DRM_IOCTL_ETHOSU_BO_CREATE = + DRM_IOCTL_ETHOSU(WR, BO_CREATE, bo_create), + DRM_IOCTL_ETHOSU_BO_WAIT = + DRM_IOCTL_ETHOSU(WR, BO_WAIT, bo_wait), + DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET = + DRM_IOCTL_ETHOSU(WR, BO_MMAP_OFFSET, bo_mmap_offset), + DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE = + DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create), + DRM_IOCTL_ETHOSU_SUBMIT = + DRM_IOCTL_ETHOSU(WR, SUBMIT, submit), +}; + +#if defined(__cplusplus) +} +#endif + +#endif /* _ETHOSU_DRM_H_ */ diff --git a/meson.build b/meson.build index 9e72f4a963e..c7ed21a0961 100644 --- a/meson.build +++ b/meson.build @@ -186,7 +186,7 @@ elif gallium_drivers.contains('all') gallium_drivers = [ 'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915', 'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris', - 'zink', 'd3d12', 'asahi', 'rocket' + 'zink', 'd3d12', 'asahi', 'rocket', 'ethosu' ] endif @@ -214,6 +214,7 @@ with_gallium_zink = gallium_drivers.contains('zink') with_gallium_d3d12 = gallium_drivers.contains('d3d12') with_gallium_asahi = gallium_drivers.contains('asahi') with_gallium_rocket = gallium_drivers.contains('rocket') +with_gallium_ethosu = gallium_drivers.contains('ethosu') foreach gallium_driver : gallium_drivers pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper()) endforeach diff --git a/meson.options b/meson.options index 27d778a3c7a..b1f98d7452a 100644 --- a/meson.options +++ b/meson.options @@ -86,7 +86,7 @@ option( value : ['auto'], choices : [ 'all', 'auto', - 'asahi', 'crocus', 'd3d12', 'etnaviv', 'freedreno', 'i915', 'iris', + 'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris', 'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi', 'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink', ], diff --git a/src/gallium/drivers/ethosu/.clang-format b/src/gallium/drivers/ethosu/.clang-format new file mode 100644 index 00000000000..34cd9d7d1d3 --- /dev/null +++ b/src/gallium/drivers/ethosu/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: InheritParentConfig +DisableFormat: false diff --git a/src/gallium/drivers/ethosu/ci/ethos-imx93-fails.txt b/src/gallium/drivers/ethosu/ci/ethos-imx93-fails.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/gallium/drivers/ethosu/ci/ethos-imx93-flakes.txt b/src/gallium/drivers/ethosu/ci/ethos-imx93-flakes.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt b/src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt new file mode 100644 index 00000000000..65e55edf082 --- /dev/null +++ b/src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt @@ -0,0 +1,14 @@ +Add.Op/.* +AddQuant.Op/.* +Conv2D.Op/.* +DepthwiseConv2D.Op/.* +FullyConnected.Op/.* + +# Don't support unfused Pad operations yet +Models.Op/yolox_000 +Models.Op/yolox_003 +Models.Op/yolox_012 +Models.Op/yolox_027 +Models.Op/yolox_042 +Models.Op/yolox_077 +Models.Op/yolox_086 diff --git a/src/gallium/drivers/ethosu/decode.py b/src/gallium/drivers/ethosu/decode.py new file mode 100644 index 00000000000..6bc4a5780c8 --- /dev/null +++ b/src/gallium/drivers/ethosu/decode.py @@ -0,0 +1,75 @@ +#!/usr/bin/python3 +# +# Copyright © 2024-2025 Tomeu Vizoso +# +# SPDX-License-Identifier: MIT 
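+#
+# Decoder for Ethos-U command stream dumps: reads 8-byte register-write
+# records from the dump file and pretty-prints them using the register
+# database in the XML file.
+#
+# Usage: decode.py --xml registers.xml --dump <dump-file>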
+
+import sys
+import os
+import argparse
+import struct
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--xml', type=str, required=True)
+    parser.add_argument('--dump', type=str, required=True)
+
+    args = parser.parse_args()
+
+    p = Parser()
+
+    try:
+        p.parse("", args.xml)
+    except Error as e:
+        print(e, file=sys.stderr)
+        exit(1)
+
+    regs = {}
+    for e in p.file:
+        if isinstance(e, Reg):
+            regs[e.offset] = e
+
+    domains = {}
+    for e in p.file:
+        if isinstance(e, Enum):
+            if e.name == "target":
+                for name, val in e.values:
+                    domains[name] = val
+
+    f = open(args.dump, mode='rb')
+    for i in range(0, os.path.getsize(args.dump) // 8):
+        cmd = f.read(8)
+        (offset, value, target) = struct.unpack("<HIH", cmd)
+        if offset in regs:
+            reg = regs[offset]
+            print("emit(%s, " % reg.full_name.upper(), end="")
+            first = True
+            for field in reg.fields:
+                field_value = (value & mask(field.low, field.high)) >> field.low
+                if field_value != 0:
+                    if not first:
+                        print(" | ", end="")
+                    print("%s_%s(%d)" % (reg.full_name.upper(), field.name.upper(), field_value), end="")
+                    first = False
+            print(");")
+        else:
+            print("%x %x %x" % (target, offset, value))
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/drivers/ethosu/ethosu_cmd.c b/src/gallium/drivers/ethosu/ethosu_cmd.c
new file mode 100644
index 00000000000..fd5fe795602
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_cmd.c
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include "util/macros.h"
+#include "util/u_dynarray.h"
+
+#include "ethosu_cmd.h"
+#include "ethosu_coefs.h"
+#include "ethosu_ml.h"
+#include "ethosu_registers.h"
+#include "ethosu_sched.h"
+
+#define MAX_BLOCKDEP 3
+#define MAX_OUTSTANDING_DMA_OPS 2
+#define MAX_OUTSTANDING_NPU_OPS 2
+
+enum ethosu_op_to_scale {
+   OP_NONE = 0,
+   OP_A = 1,
+   OP_B = 2,
+};
+
+static void
+ethosu_ensure_cmdstream(struct ethosu_subgraph *subgraph)
+{
+   if ((subgraph->cursor - subgraph->cmdstream) < (subgraph->cmdstream_used - 2))
+      return;
+
+   unsigned cur_size = subgraph->cursor - subgraph->cmdstream;
+   subgraph->cmdstream = realloc(subgraph->cmdstream, (subgraph->cmdstream_used + 32) * sizeof(*subgraph->cmdstream));
+   subgraph->cursor = subgraph->cmdstream + cur_size;
+   subgraph->cmdstream_used += 32;
+}
+
+/*
+ * Each command word packs the command id in its low 16 bits and an immediate
+ * parameter in its high 16 bits. EMIT1 commands additionally set bit 14 of
+ * the id and are followed by a 32-bit payload word.
+ */
+#define EMIT0(cmd, param)                                                                          \
+   do {                                                                                            \
+      ethosu_ensure_cmdstream(subgraph);                                                           \
+      *(subgraph->cursor++) = cmd | (((param) & 0xFFFF) << 16);                                    \
+      if (DBG_ENABLED(ETHOSU_DBG_MSGS))                                                            \
+         fprintf(stderr, "emit0(%s, 0x%x);\n", ethosu_get_cmd_name(0, cmd), (param) & 0xFFFF);     \
+   } while (0)
+
+#define EMIT1(cmd, param, offset)                                                                  \
+   do {                                                                                            \
+      ethosu_ensure_cmdstream(subgraph);                                                           \
+      *(subgraph->cursor++) = cmd | 0x4000 | (((param) & 0xFFFF) << 16);                           \
+      *(subgraph->cursor++) = (offset) & 0xFFFFFFFF;                                               \
+      if (DBG_ENABLED(ETHOSU_DBG_MSGS))                                                            \
+         fprintf(stderr, "emit1(%s, 0x%x, 0x%x);\n", ethosu_get_cmd_name(1, cmd), (param) & 0xFFFF, (int)(offset)); \
+   } while (0)
+
+static void
+emit_addresses(
+   struct ethosu_subgraph *subgraph,
+   struct ethosu_feature_map *feature_map,
+   uint32_t cmd_base0, uint32_t cmd_base1, uint32_t cmd_base2, uint32_t cmd_base3)
+{
+   EMIT1(cmd_base0, 0x0, feature_map->tiles.addresses[0]);
+   EMIT1(cmd_base1, 0x0, feature_map->tiles.addresses[1]);
+   EMIT1(cmd_base2, 0x0, feature_map->tiles.addresses[2]);
+   EMIT1(cmd_base3, 0x0, feature_map->tiles.addresses[3]);
+}
+
+static void
+emit_tiles(
+   struct ethosu_subgraph *subgraph,
+   struct ethosu_feature_map *feature_map,
+   uint32_t cmd_height0, uint32_t cmd_height1, uint32_t cmd_width0)
+{
+   EMIT0(cmd_height0, feature_map->tiles.height_0 - 1);
+   EMIT0(cmd_height1,
feature_map->tiles.height_1 - 1); + EMIT0(cmd_width0, feature_map->tiles.width_0 - 1); +} + +static void +emit_strides( + struct ethosu_subgraph *subgraph, + struct ethosu_feature_map *feature_map, + uint32_t cmd_stride_c, uint32_t cmd_stride_y, uint32_t cmd_stride_x) +{ + unsigned elem_size = 1; + unsigned tensor_x, tensor_y, tensor_c; + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx); + + if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) { + tensor_x = 16 * elem_size; + tensor_c = tensor_x * tensor->shape.width; + tensor_y = elem_size * tensor->shape.width * ALIGN(tensor->shape.depth, 16); + } else { + tensor_c = elem_size; + tensor_x = tensor->shape.depth * tensor_c; + tensor_y = tensor->shape.width * tensor_x; + } + + EMIT1(cmd_stride_c, 0x0, tensor_c); + EMIT1(cmd_stride_y, 0x0, tensor_y); + EMIT1(cmd_stride_x, 0x0, tensor_x); +} + +static void +emit_ifm(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map) +{ + EMIT0(NPU_SET_IFM_REGION, IO_REGION); + emit_addresses( + subgraph, + feature_map, + NPU_SET_IFM_BASE0, + NPU_SET_IFM_BASE1, + NPU_SET_IFM_BASE2, + NPU_SET_IFM_BASE3); + + emit_tiles( + subgraph, feature_map, NPU_SET_IFM_HEIGHT0_M1, NPU_SET_IFM_HEIGHT1_M1, NPU_SET_IFM_WIDTH0_M1); + + EMIT0(NPU_SET_IFM_DEPTH_M1, feature_map->shape.depth - 1); + emit_strides(subgraph, feature_map, NPU_SET_IFM_STRIDE_C, NPU_SET_IFM_STRIDE_Y, NPU_SET_IFM_STRIDE_X); + EMIT0(NPU_SET_IFM_ZERO_POINT, feature_map->zero_point); +} + +static void +emit_ifm_precision(struct ethosu_subgraph *subgraph, + struct ethosu_feature_map *feature_map, + enum ethosu_op_to_scale op_to_scale, uint32_t precision_cmd) +{ + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx); + unsigned prec = 0; + + if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) + prec |= NPU_SET_IFM_PRECISION_FORMAT(1); + + if (feature_map->is_signed) + prec |= NPU_SET_IFM_PRECISION_ACTIVATION(1); // signed activation + + prec |= NPU_SET_IFM_PRECISION_SCALE_MODE(op_to_scale); + + EMIT0(precision_cmd, prec); +} + +static void +emit_padding(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_IFM_PAD_TOP, operation->pad.top); + EMIT0(NPU_SET_IFM_PAD_LEFT, operation->pad.left); + EMIT0(NPU_SET_IFM_PAD_BOTTOM, operation->pad.bottom); + EMIT0(NPU_SET_IFM_PAD_RIGHT, operation->pad.right); +} + +static void +emit_ofm(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map) +{ + EMIT0(NPU_SET_OFM_REGION, IO_REGION); + emit_addresses( + subgraph, + feature_map, + NPU_SET_OFM_BASE0, + NPU_SET_OFM_BASE1, + NPU_SET_OFM_BASE2, + NPU_SET_OFM_BASE3); + emit_tiles( + subgraph, feature_map, NPU_SET_OFM_HEIGHT0_M1, NPU_SET_OFM_HEIGHT1_M1, NPU_SET_OFM_WIDTH0_M1); + EMIT0(NPU_SET_OFM_HEIGHT_M1, feature_map->shape.height - 1); + EMIT0(NPU_SET_OFM_WIDTH_M1, feature_map->shape.width - 1); + EMIT0(NPU_SET_OFM_DEPTH_M1, feature_map->shape.depth - 1); + emit_strides(subgraph, feature_map, NPU_SET_OFM_STRIDE_C, NPU_SET_OFM_STRIDE_Y, NPU_SET_OFM_STRIDE_X); + EMIT0(NPU_SET_OFM_ZERO_POINT, feature_map->zero_point); +} + +static void +emit_ofm_precision(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, operation->ofm.tensor_idx); + unsigned prec = 0; + + if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) + prec |= NPU_SET_OFM_PRECISION_FORMAT(1); + + if (operation->ofm.is_signed) + prec |= NPU_SET_OFM_PRECISION_ACTIVATION(1); + + if (operation->type == 
ETHOSU_OPERATION_TYPE_POOLING || + operation->type == ETHOSU_OPERATION_TYPE_ELTWISE) { + prec |= NPU_SET_OFM_PRECISION_SCALE_MODE(1); + } + + prec |= NPU_SET_OFM_PRECISION_ROUND_MODE(operation->round_mode); + + EMIT0(NPU_SET_OFM_PRECISION, prec); +} + +static void +emit_kernel(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_KERNEL_HEIGHT_M1, operation->kernel.height - 1); + EMIT0(NPU_SET_KERNEL_WIDTH_M1, operation->kernel.width - 1); + unsigned stride = (operation->kernel.stride_x - 1) & 1; + stride |= ((operation->kernel.stride_y - 1) & 1) << 1; + stride |= ((operation->kernel.stride_x - 1) >> 1) << 6; + stride |= ((operation->kernel.stride_y - 1) >> 1) << 9; + stride |= (operation->kernel.dilation_x - 1) << 3; + stride |= (operation->kernel.dilation_y - 1) << 4; + stride |= operation->conv.part_kernel_first << 2; + EMIT0(NPU_SET_KERNEL_STRIDE, stride); +} + +static void +emit_weights(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_WEIGHT_REGION, operation->conv.weights.region); + EMIT1(NPU_SET_WEIGHT_BASE, 0x0, operation->conv.weights.address); + EMIT1(NPU_SET_WEIGHT_LENGTH, 0x0, operation->conv.weights.size); +} + +static void +emit_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_SCALE_REGION, operation->conv.scales.region); + EMIT1(NPU_SET_SCALE_BASE, 0x0, operation->conv.scales.address); + EMIT1(NPU_SET_SCALE_LENGTH, 0x0, operation->conv.scales.size); +} + +static void +emit_activation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_ACTIVATION, 0x0); + + if (operation->ofm.is_signed) { + EMIT0(NPU_SET_ACTIVATION_MIN, 0xff80); + EMIT0(NPU_SET_ACTIVATION_MAX, 0x7f); + } else { + EMIT0(NPU_SET_ACTIVATION_MIN, 0x00); + EMIT0(NPU_SET_ACTIVATION_MAX, 0xff); + } +} + +static void +emit_block_config(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_OFM_BLK_HEIGHT_M1, operation->block_config.ofm_block.height - 1); + EMIT0(NPU_SET_OFM_BLK_WIDTH_M1, operation->block_config.ofm_block.width - 1); + EMIT0(NPU_SET_OFM_BLK_DEPTH_M1, operation->block_config.ofm_block.depth - 1); +} + +static void +emit_shram_registers(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_IFM_IB_END, operation->block_config.shram_layout.ib_end); + EMIT0(NPU_SET_AB_START, operation->block_config.shram_layout.ab_start); + + if (operation->type == ETHOSU_OPERATION_TYPE_ELTWISE) + EMIT0(NPU_SET_IFM2_IB_START, operation->block_config.shram_layout.ib_start2); + + EMIT0(NPU_SET_ACC_FORMAT, operation->block_config.acc_type); +} + +static void +emit_common(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, enum ethosu_op_to_scale op_to_scale) +{ + emit_ifm(subgraph, &operation->ifm); + emit_ifm_precision(subgraph, &operation->ifm, op_to_scale, NPU_SET_IFM_PRECISION); + EMIT0(NPU_SET_IFM_UPSCALE, operation->upscale); + + if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) + emit_padding(subgraph, operation); + + emit_ofm(subgraph, &operation->ofm); + + emit_ofm_precision(subgraph, operation); + + if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) + emit_kernel(subgraph, operation); + + if (operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION) { + emit_weights(subgraph, operation); + emit_biases(subgraph, operation); + } + + emit_activation(subgraph, operation); + + emit_block_config(subgraph, operation); + if 
(ethosu_is_u65(ethosu_screen(subgraph->base.context->screen)))
+      emit_shram_registers(subgraph, operation);
+   else
+      EMIT0(NPU_SET_ACC_FORMAT, 0x300); // FIXME should be based on # of MACs, only works for >=256 MACs
+}
+
+static void
+emit_convolution(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   ethosu_allocate_feature_map(subgraph, &operation->ifm);
+   operation->ifm.tiles.height_0 = operation->ifm.shape.height;
+   operation->ifm.tiles.height_1 = operation->ifm.shape.height;
+   operation->ifm.tiles.width_0 = operation->ifm.shape.width;
+
+   ethosu_allocate_feature_map(subgraph, &operation->ofm);
+   operation->ofm.tiles.height_0 = operation->ofm.shape.height;
+   operation->ofm.tiles.height_1 = operation->ofm.shape.height;
+   operation->ofm.tiles.width_0 = operation->ofm.shape.width;
+
+   emit_common(subgraph, operation, OP_NONE);
+}
+
+static unsigned
+quantise_pooling_scale(unsigned nr_kernel_elements, int rescale_bits, unsigned *out_shift)
+{
+   int k = 0;
+   long long N = 0;
+
+   frexp(nr_kernel_elements - 1, &k);
+   N = 31 - rescale_bits;
+   *out_shift = N + k;
+
+   return ((1LL << (N + k)) + (1LL << k)) / nr_kernel_elements;
+}
+
+static unsigned
+pooling_emit_ofm_scaling(
+   double input1_scale,
+   double output_scale,
+   unsigned kernel_height,
+   unsigned kernel_width,
+   uint32_t *out_shift)
+{
+   double rescale = input1_scale / output_scale;
+   int rescale_bits = 0;
+   unsigned scale;
+
+   if (kernel_height == 1 && kernel_width == 1) {
+      if (rescale > 1.0)
+         rescale_bits = 32 - __builtin_clz(ceil(rescale)) + 1;
+      else if (rescale < 1.0)
+         rescale_bits = -(32 - __builtin_clz(ceil(1 / rescale))) - 1;
+   }
+   scale = quantise_pooling_scale(kernel_height * kernel_width, rescale_bits, out_shift);
+   scale = ceil(scale * rescale);
+   return scale;
+}
+
+static void
+emit_pooling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   unsigned scale;
+   unsigned scale_shift;
+
+   emit_common(subgraph, operation, OP_NONE);
+
+   if (operation->pooling.avg) {
+      scale = pooling_emit_ofm_scaling(
+         operation->ifm.scale,
+         operation->ofm.scale,
+         operation->kernel.height,
+         operation->kernel.width,
+         &scale_shift);
+
+      EMIT1(NPU_SET_OFM_SCALE, scale_shift, scale);
+   }
+}
+
+static void
+emit_ifm2(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, bool has_scalar)
+{
+   if (!has_scalar) {
+      EMIT0(NPU_SET_IFM2_REGION, IO_REGION);
+      emit_addresses(subgraph, &operation->ifm2, NPU_SET_IFM2_BASE0, NPU_SET_IFM2_BASE1, NPU_SET_IFM2_BASE2, NPU_SET_IFM2_BASE3);
+      emit_tiles(subgraph, &operation->ifm2, NPU_SET_IFM2_HEIGHT0_M1, NPU_SET_IFM2_HEIGHT1_M1, NPU_SET_IFM2_WIDTH0_M1);
+      emit_strides(subgraph, &operation->ifm2, NPU_SET_IFM2_STRIDE_C, NPU_SET_IFM2_STRIDE_Y, NPU_SET_IFM2_STRIDE_X);
+   }
+   EMIT0(NPU_SET_IFM2_ZERO_POINT, operation->ifm2.zero_point);
+}
+
+static void
+emit_ifm2_broadcast(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   unsigned ifm2_broadcast = 0;
+
+   EMIT0(NPU_SET_IFM2_BROADCAST, ifm2_broadcast);
+}
+
+/*
+def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
+    input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
+    input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
+    output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None
+
+    if npu_op.activation is not None and npu_op.activation.op_type in (
+        NpuActivationOp.SIGMOID,
+        NpuActivationOp.TANH,
+    ):
+        output_scale = 1 /
0x3000 + + if npu_op.sub_op_type == NpuElementWiseOp.MUL: + if npu_op.rescale: + ofm_scale, shift = npu_op.rescale + elif None in (input_scale, input2_scale, output_scale): + ofm_scale = 1 + shift = 0 + else: + ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) + else: # Add/Sub + # Default operand scaling is no scaling + opa_scale = opb_scale = 1 + opa_shift = 0 + bitdepth = npu_op.ifm.data_type.size_in_bits() + use_advanced_scaling = False + if npu_op.rescale is not None: + # Explicit ofm scaling + ofm_scale, shift = npu_op.rescale + elif None in (input_scale, input2_scale, output_scale): + # No ofm scaling + ofm_scale = 1 + shift = 0 + elif input_scale == input2_scale and bitdepth == 16: + # int16 same scaling + opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale + ) + # align the double rounding with that of advanced scaling + opa_scale //= 2 + opb_scale //= 2 + shift -= 1 + opa_shift = 0 # Unused for this case + elif input_scale == input2_scale: + # Same scaling + opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale + ) + opa_shift = 0 # Unused for this case + # For 8 bit we can't guarantee double rounding with simplified scaling will always be + # the same as with advanced scaling due to different shifts. When the ofm scale fulfils + # the following we know that double rounding will have no effect for advanced scaling + # no matter the input, so we can safely use simplified scaling with double rounding disabled. + use_advanced_scaling = int(ofm_scale) & 0xFFF != 0 + else: + use_advanced_scaling = True + if use_advanced_scaling: + # Use advanced implementation only when input/output scales differ, + # or when we can't guarantee the absence of rounding errors + ( + opa_scale, + opa_shift, + ofm_scale, + shift, + op_to_scale, + ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth) + opb_scale = 0 # Unused for this case + if npu_op.reversed_operands: + # If the operand order is reversed we also have to swap which operand is scaled + if op_to_scale == scaling.OperandToScale.OPa: + op_to_scale = scaling.OperandToScale.OPb + else: + op_to_scale = scaling.OperandToScale.OPa + emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) + emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) +*/ + +static void +simplified_elementwise_add_sub_scale( + double input1_scale, + double input2_scale, + double output_scale, + uint32_t input_shift, + double *out_input1_rescale, + double *out_input2_rescale, + uint32_t *out_out_scale, + uint32_t *out_out_shift) +{ + double max_input_scale = MAX2(input1_scale, input2_scale); + double input_shift_val = (double)(1LL << input_shift); /* Use 1LL for large shifts */ + + *out_input1_rescale = input1_scale * input_shift_val / (2.0 * max_input_scale); + *out_input2_rescale = input2_scale * input_shift_val / (2.0 * max_input_scale); + + /* + * Be careful with division by zero or very small output_scale if output_scale + * can be zero or close to zero. 
+ */ + double output_rescale_val; + if (output_scale == 0.0) { + /* Handle error or return specific value */ + output_rescale_val = 0.0; /* Or INFINITY, depending on desired behavior */ + } else { + output_rescale_val = (2.0 * max_input_scale) / (output_scale * input_shift_val); + } + + *out_out_scale = ethosu_quantize_scale(output_rescale_val, out_out_shift); +} + +static enum ethosu_op_to_scale +eltwise_emit_ofm_scaling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + double max_input_scale = MAX2(operation->ifm.scale, operation->ifm2.scale); + double min_input_scale = MIN2(operation->ifm.scale, operation->ifm2.scale); + unsigned bitdepth = 8; + uint32_t input_shift = (bitdepth == 8) ? 20 : 15; + double input1_rescale_tmp; + double input2_rescale_tmp; + unsigned ofm_scale, ofm_shift; + unsigned opa_scale, opa_shift; + + simplified_elementwise_add_sub_scale( + min_input_scale, max_input_scale, operation->ofm.scale, input_shift, + &input1_rescale_tmp, &input2_rescale_tmp, + &ofm_scale, &ofm_shift); + + opa_scale = ethosu_quantize_scale(input1_rescale_tmp, &opa_shift); + + EMIT1(NPU_SET_OPA_SCALE, opa_shift, opa_scale); + EMIT1(NPU_SET_OPB_SCALE, 0x0, 0x0); + EMIT1(NPU_SET_OFM_SCALE, ofm_shift, ofm_scale); + + if (operation->ifm.scale < operation->ifm2.scale) + return OP_A; + else + return OP_B; +} + +static void +emit_eltwise(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + bool has_scalar = false; + enum ethosu_op_to_scale op_to_scale = OP_NONE; + + op_to_scale = eltwise_emit_ofm_scaling(subgraph, operation); + + emit_common(subgraph, operation, op_to_scale); + + emit_ifm2(subgraph, operation, has_scalar); + emit_ifm_precision(subgraph, &operation->ifm2, OP_NONE, NPU_SET_IFM2_PRECISION); + emit_ifm2_broadcast(subgraph, operation); +} + +static void +emit_dma(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_DMA0_SRC_REGION, COEFS_REGION); + EMIT1(NPU_SET_DMA0_SRC, 0x0, operation->dma.address); + EMIT0(NPU_SET_DMA0_DST_REGION, SCRATCH_REGION); + EMIT1(NPU_SET_DMA0_DST, 0x0, 0x0); + EMIT1(NPU_SET_DMA0_LEN, 0x0, operation->dma.size); +} + +static void +emit_operation_code(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + switch (operation->type) { + case ETHOSU_OPERATION_TYPE_CONVOLUTION: + + if (operation->conv.depthwise) + EMIT0(NPU_OP_DEPTHWISE, 0x0); + else + EMIT0(NPU_OP_CONV, 0x0); + + break; + case ETHOSU_OPERATION_TYPE_POOLING: + EMIT0(NPU_OP_POOL, operation->pooling.avg); + break; + case ETHOSU_OPERATION_TYPE_ELTWISE: + EMIT0(NPU_OP_ELEMENTWISE, 0x1); + break; + case ETHOSU_OPERATION_TYPE_DMA: + EMIT0(NPU_OP_DMA_START, 0x0); + break; + } +} + +static void +emit_cmd_waits(struct ethosu_subgraph *subgraph, int npu_waits, int dma_waits) +{ + if (npu_waits >= 0) + EMIT0(NPU_OP_KERNEL_WAIT, npu_waits); + + if (dma_waits >= 0) + EMIT0(NPU_OP_DMA_WAIT, dma_waits); +} + +static bool +ethosu_intersects_accesses(struct ethosu_address_range *a, struct ethosu_address_range *b) +{ + for (int i = 0; i < MAX_MEMORY_ACCESSES; i++) { + for (int j = 0; j < MAX_MEMORY_ACCESSES; j++) { + if (a[i].size == 0 || b[j].size == 0) + continue; + if (a[i].region != b[j].region) + continue; + if (a[i].address < b[j].address + b[j].size && + b[j].address < a[i].address + a[i].size) + return true; + } + } + + return false; +} + +static bool +ethosu_operations_conflict(struct ethosu_subgraph *subgraph, + struct ethosu_operation *op1, struct ethosu_operation *op2) +{ + /* True dependencies, or write 
-> read */ + if (ethosu_intersects_accesses(op1->write_accesses, op2->read_accesses)) + return true; + + /* Anti-dependencies, or read -> write */ + if (ethosu_intersects_accesses(op1->read_accesses, op2->write_accesses)) + return true; + + /* Output dependencies, or write -> write */ + if (ethosu_intersects_accesses(op1->write_accesses, op2->write_accesses)) + return true; + + /* read -> read does not cause a conflict */ + return false; +} + +static void +get_wait_dependency(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, + struct util_dynarray *outstanding_dma_ops, + struct util_dynarray *outstanding_npu_ops, + int *npu_waits, int *dma_waits) +{ + unsigned kern_wait = -1; + unsigned dma_wait = -1; + struct util_dynarray *outstanding_ops = NULL; + + if (operation->type == ETHOSU_OPERATION_TYPE_DMA) { + outstanding_ops = outstanding_npu_ops; + + util_dynarray_append(outstanding_dma_ops, struct ethosu_operation *, operation); + + unsigned dmap_ops = util_dynarray_num_elements(outstanding_dma_ops, struct ethosu_operation *); + if (dmap_ops > MAX_OUTSTANDING_DMA_OPS) + (void)util_dynarray_pop(outstanding_dma_ops, struct ethosu_operation *); + } else { + outstanding_ops = outstanding_dma_ops; + + util_dynarray_append(outstanding_npu_ops, struct ethosu_operation *, operation); + + unsigned npu_ops = util_dynarray_num_elements(outstanding_npu_ops, struct ethosu_operation *); + if (npu_ops > MAX_OUTSTANDING_NPU_OPS) + (void)util_dynarray_pop(outstanding_npu_ops, struct ethosu_operation *); + } + + unsigned waits = -1; + for (int idx = util_dynarray_num_elements(outstanding_ops, struct ethosu_operation *) - 1; idx >= 0; idx--) { + waits += 1; + struct ethosu_operation *other_op = *util_dynarray_element(outstanding_ops, struct ethosu_operation *, idx); + if (other_op == operation) + continue; + if (ethosu_operations_conflict(subgraph, other_op, operation)) { + if (operation->type == ETHOSU_OPERATION_TYPE_DMA) + kern_wait = waits; + else + dma_wait = waits; + // Current op needs to wait, and after it has waited, + // outstanding_ops[0..idx] are not outstanding any longer. 
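+            // The wait count is the number of more recent ops on the other
+            // queue that may still be in flight once the wait completes.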
+ for (int i = 0; i <= idx; i++) + (void)util_dynarray_pop(outstanding_ops, struct ethosu_operation *); + break; + } + } + + *npu_waits = kern_wait; + *dma_waits = dma_wait; +} + +static void +fill_memory_accesses(struct ethosu_subgraph *subgraph) +{ + util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) { + switch (operation->type) { + case ETHOSU_OPERATION_TYPE_DMA: + operation->read_accesses[0].region = COEFS_REGION; + operation->read_accesses[0].address = operation->dma.address; + operation->read_accesses[0].size = operation->dma.size; + + operation->write_accesses[0].region = SCRATCH_REGION; + operation->write_accesses[0].address = 0x0; + operation->write_accesses[0].size = operation->dma.size; + + break; + default: + operation->read_accesses[0].region = IO_REGION; + operation->read_accesses[0].address = operation->ifm.tiles.addresses[0]; + operation->read_accesses[0].size = operation->ifm.shape.height * operation->ifm.shape.width * operation->ifm.shape.depth; + + operation->read_accesses[1].region = IO_REGION; + operation->read_accesses[1].address = operation->ifm2.tiles.addresses[0]; + operation->read_accesses[1].size = operation->ifm2.shape.height * operation->ifm2.shape.width * operation->ifm2.shape.depth; + + operation->read_accesses[2].region = operation->conv.scales.region; + operation->read_accesses[2].address = operation->conv.scales.address; + operation->read_accesses[2].size = operation->conv.scales.size; + + operation->read_accesses[3].region = operation->conv.weights.region; + operation->read_accesses[3].address = operation->conv.weights.address; + operation->read_accesses[3].size = operation->conv.weights.size; + + operation->write_accesses[0].region = IO_REGION; + operation->write_accesses[0].address = operation->ofm.tiles.addresses[0]; + operation->write_accesses[0].size = operation->ofm.shape.height * operation->ofm.shape.width * operation->ofm.shape.depth; + break; + } + } +} + +static unsigned +calc_blockdep(struct ethosu_subgraph *subgraph, struct ethosu_operation *prev_op, struct ethosu_operation *operation) +{ + if (!prev_op) + return 0; + + // Check if the reserved shram will be used in current/prev op + bool prev_uses_lut = false; // prev_op->activation && prev_op->activation->op_type == NpuActivationOp.TABLE_LOOKUP; + bool curr_uses_lut = false; // operation->activation && operation->activation->op_type == NpuActivationOp.TABLE_LOOKUP; + if (prev_uses_lut && SHRAM_RESERVED_UNUSED_BANKS == 0 && !curr_uses_lut) + return 0; + + return MAX_BLOCKDEP; /* TODO: Check if there is actually overlap between the FMs */ +} + +void +ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph) +{ + struct ethosu_operation *prev_op = NULL; + struct util_dynarray outstanding_dma_ops; + struct util_dynarray outstanding_npu_ops; + + util_dynarray_init(&outstanding_dma_ops, NULL); + util_dynarray_init(&outstanding_npu_ops, NULL); + + subgraph->cmdstream_used = 32; + subgraph->cmdstream = calloc(subgraph->cmdstream_used, sizeof(*subgraph->cmdstream)); + subgraph->cursor = subgraph->cmdstream; + + fill_memory_accesses(subgraph); + + /* Compile */ + + if (ethosu_is_u65(ethosu_screen(subgraph->base.context->screen))) + EMIT0(NPU_SET_PARALLEL_MODE, 0x0); + + util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) { + + int npu_waits, dma_waits; + + get_wait_dependency(subgraph, operation, &outstanding_dma_ops, &outstanding_npu_ops, + &npu_waits, &dma_waits); + + switch (operation->type) { + case ETHOSU_OPERATION_TYPE_CONVOLUTION: + 
emit_convolution(subgraph, operation); + break; + case ETHOSU_OPERATION_TYPE_POOLING: + emit_pooling(subgraph, operation); + break; + case ETHOSU_OPERATION_TYPE_ELTWISE: + emit_eltwise(subgraph, operation); + break; + case ETHOSU_OPERATION_TYPE_DMA: + emit_dma(subgraph, operation); + break; + } + + if (operation->type != ETHOSU_OPERATION_TYPE_DMA) { + unsigned blockdep = calc_blockdep(subgraph, prev_op, operation); + blockdep = MIN2(blockdep, MAX_BLOCKDEP); + EMIT0(NPU_SET_BLOCKDEP, blockdep); + + prev_op = operation; + } + + emit_cmd_waits(subgraph, npu_waits, dma_waits); + emit_operation_code(subgraph, operation); + } + + EMIT0(NPU_OP_STOP, 0xffff); + + util_dynarray_fini(&outstanding_dma_ops); + util_dynarray_fini(&outstanding_npu_ops); +} diff --git a/src/gallium/drivers/ethosu/ethosu_cmd.h b/src/gallium/drivers/ethosu/ethosu_cmd.h new file mode 100644 index 00000000000..372391eac69 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_cmd.h @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef ETHOSU_CMD_H +#define ETHOSU_CMD_H + +#include "ethosu_ml.h" + +void ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph); + +#endif /* ETHOSU_CMD_H */ diff --git a/src/gallium/drivers/ethosu/ethosu_coefs.c b/src/gallium/drivers/ethosu/ethosu_coefs.c new file mode 100644 index 00000000000..a46cc3370cd --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_coefs.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "util/u_inlines.h" + +#include "mlw_codec/mlw_encode.h" +#include "ethosu_coefs.h" + +static void +fill_scale_and_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, uint8_t **scales, long *scales_size, struct pipe_resource *bias_rsrc) +{ + struct pipe_transfer *transfer_in; + int32_t *biases = pipe_buffer_map(subgraph->base.context, bias_rsrc, + PIPE_MAP_READ, &transfer_in); + unsigned idx = 0; + + *scales_size = ALIGN(operation->ofm.shape.depth * 10, 16); + *scales = malloc(*scales_size); + memset(*scales, 0, *scales_size); + + for (unsigned i = 0; i < operation->ofm.shape.depth; i++) { + uint64_t bias = biases[i]; + double conv_scale = ((double)operation->ifm.scale * (double)operation->kernel.scale) / (double)operation->ofm.scale; + uint32_t shift; + int scale = ethosu_quantize_scale(conv_scale, &shift); + + (*scales)[idx++] = (bias >> (0 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (1 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (2 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (3 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (4 * 8)) & 0xFF; + + (*scales)[idx++] = (scale >> (0 * 8)) & 0xFF; + (*scales)[idx++] = (scale >> (1 * 8)) & 0xFF; + (*scales)[idx++] = (scale >> (2 * 8)) & 0xFF; + (*scales)[idx++] = (scale >> (3 * 8)) & 0xFF; + + (*scales)[idx++] = shift & 0x3F; + } + + pipe_buffer_unmap(subgraph->base.context, transfer_in); +} + +static void +calculate_weights_strides(struct ethosu_operation *operation, int out_strides[4]) +{ + if (operation->kernel.depthwise) { + out_strides[0] = 1; + out_strides[1] = operation->ofm.shape.depth * operation->kernel.height; + out_strides[2] = operation->ofm.shape.depth; + out_strides[3] = operation->ofm.shape.depth * operation->kernel.width; + } else { + out_strides[3] = 1; + out_strides[2] = out_strides[3] * operation->ifm.shape.depth; + out_strides[1] = out_strides[2] * operation->kernel.width; + out_strides[0] = out_strides[1] * operation->kernel.height; + } +} + +static void +fill_weights(struct ethosu_subgraph 
*subgraph, struct ethosu_operation *operation, uint8_t **weights, long *weights_size, struct pipe_resource *weight_rsrc)
+{
+   int brick_strides[4] = {0};
+   unsigned input_channels = operation->ifm.shape.depth;
+
+   if (operation->kernel.depthwise)
+      input_channels = 1;
+
+   calculate_weights_strides(operation, brick_strides);
+
+   struct pipe_transfer *transfer_in;
+   uint8_t *input_weights_8 = pipe_buffer_map(subgraph->base.context, weight_rsrc,
+                                              PIPE_MAP_READ, &transfer_in);
+   int16_t *input_weights = malloc(pipe_buffer_size(weight_rsrc) * sizeof(*input_weights));
+   for (int i = 0; i < pipe_buffer_size(weight_rsrc); i++) {
+      if (operation->kernel.is_signed)
+         input_weights[i] = (int8_t)input_weights_8[i] - operation->kernel.zero_point;
+      else
+         input_weights[i] = input_weights_8[i] - operation->kernel.zero_point;
+   }
+   pipe_buffer_unmap(subgraph->base.context, transfer_in);
+
+   long padded_size = 0;
+   *weights_size = mlw_reorder_encode(
+      IFM_UBLOCK.depth,
+      OFM_UBLOCK.depth,
+      operation->ofm.shape.depth,
+      operation->kernel.height,
+      operation->kernel.width,
+      input_channels,
+      brick_strides,
+      input_weights,
+      operation->block_config.ofm_block.depth,
+      operation->kernel.depthwise,
+      operation->conv.part_kernel_first,
+      8 /* ifm_bitdepth */,
+      8 /* decomp_h */,
+      8 /* decomp_w */,
+      weights,
+      &padded_size,
+      DBG_ENABLED(ETHOSU_DBG_MSGS));
+
+   free(input_weights);
+}
+
+void
+fill_coefs(struct ethosu_subgraph *subgraph,
+           struct ethosu_operation *operation,
+           struct pipe_resource *bias_rsrc,
+           struct pipe_resource *weight_rsrc)
+{
+   uint8_t *scales = NULL;
+   fill_scale_and_biases(subgraph, operation, &scales, &operation->conv.scales.size, bias_rsrc);
+
+   operation->conv.scales.region = COEFS_REGION;
+   operation->conv.scales.address = subgraph->coefs_used;
+   subgraph->coefs_used += ALIGN_POT(operation->conv.scales.size, 16);
+   subgraph->coefs = realloc(subgraph->coefs, subgraph->coefs_used);
+   memcpy(subgraph->coefs + operation->conv.scales.address, scales, operation->conv.scales.size);
+   free(scales);
+
+   uint8_t *weights = NULL;
+   fill_weights(subgraph, operation, &weights, &operation->conv.weights.size, weight_rsrc);
+
+   operation->conv.weights.region = COEFS_REGION;
+   operation->conv.weights.address = subgraph->coefs_used;
+   subgraph->coefs_used += ALIGN_POT(operation->conv.weights.size, 16);
+   subgraph->coefs = realloc(subgraph->coefs, subgraph->coefs_used);
+   memcpy(subgraph->coefs + operation->conv.weights.address, weights, operation->conv.weights.size);
+   free(weights);
+}
diff --git a/src/gallium/drivers/ethosu/ethosu_coefs.h b/src/gallium/drivers/ethosu/ethosu_coefs.h
new file mode 100644
index 00000000000..7b63f11de2b
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_coefs.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ETHOSU_COEFS_H
+#define ETHOSU_COEFS_H
+
+#include "ethosu_ml.h"
+
+void
+fill_coefs(struct ethosu_subgraph *subgraph,
+           struct ethosu_operation *operation,
+           struct pipe_resource *bias_rsrc,
+           struct pipe_resource *weight_rsrc);
+
+#endif /* ETHOSU_COEFS_H */
diff --git a/src/gallium/drivers/ethosu/ethosu_device.c b/src/gallium/drivers/ethosu/ethosu_device.c
new file mode 100644
index 00000000000..392717f85a3
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_device.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "ethosu_device.h"
+#include "ethosu_ml.h"
+
+#include "drm-uapi/ethosu_accel.h"
+
+#include <xf86drm.h>
+#include "util/os_mman.h"
+#include "util/u_inlines.h" +#include "util/u_surface.h" +#include "util/u_transfer.h" + +static const struct debug_named_value ethosu_debug_options[] = { + {"dbg_msgs", ETHOSU_DBG_MSGS, "Print debug messages"}, + {"dump_bos", ETHOSU_DBG_DUMP_BOS, "Dump buffers for analysis"}, + {"zero_bos", ETHOSU_DBG_ZERO, "Zero buffers for debugging"}, + {"disable_nhcwb16", ETHOSU_DBG_DISABLE_NHCWB16, "Disable NHCWB16"}, + {"disable_sram", ETHOSU_DBG_DISABLE_SRAM, "Disable SRAM"}, + DEBUG_NAMED_VALUE_END}; + +DEBUG_GET_ONCE_FLAGS_OPTION(ethosu_debug, "ETHOSU_DEBUG", ethosu_debug_options, 0) +int ethosu_debug = 0; + +static void +ethosu_destroy_screen(struct pipe_screen *pscreen) +{ + struct ethosu_screen *screen = ethosu_screen(pscreen); + + ralloc_free(screen); +} + +static void +ethosu_destroy_context(struct pipe_context *pctx) +{ + struct ethosu_context *ctx = ethosu_context(pctx); + + ralloc_free(ctx); +} + +static void * +ethosu_buffer_map(struct pipe_context *pctx, + struct pipe_resource *prsc, unsigned level, + unsigned usage, const struct pipe_box *box, + struct pipe_transfer **out_transfer) +{ + struct ethosu_screen *screen = ethosu_screen(pctx->screen); + struct ethosu_resource *rsc = ethosu_resource(prsc); + struct drm_ethosu_bo_wait bo_wait = {0}; + struct drm_ethosu_bo_mmap_offset bo_mmap_offset = {0}; + int ret; + + assert(level == 0); + assert(prsc->target == PIPE_BUFFER); + assert(box->y == 0); + assert(box->z == 0); + assert(box->height == 1); + assert(box->depth == 1); + + struct pipe_transfer *transfer = rzalloc(NULL, struct pipe_transfer); + transfer->level = level; + transfer->usage = usage; + transfer->box = *box; + + pipe_resource_reference(&transfer->resource, prsc); + + bo_wait.handle = rsc->handle; + bo_wait.timeout_ns = INT64_MAX; + + ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_WAIT, &bo_wait); + if (ret == -1) + goto free_transfer; + + bo_mmap_offset.handle = rsc->handle; + ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET, &bo_mmap_offset); + if (ret == -1) + goto free_transfer; + + uint8_t *map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED, + screen->fd, bo_mmap_offset.offset); + assert(map != MAP_FAILED); + if (map == MAP_FAILED) + goto free_transfer; + + *out_transfer = transfer; + + return map + box->x; + +free_transfer: + pipe_resource_reference(&transfer->resource, NULL); + ralloc_free(transfer); + return NULL; +} + +static void +ethosu_buffer_unmap(struct pipe_context *pctx, + struct pipe_transfer *transfer) +{ + pipe_resource_reference(&transfer->resource, NULL); + ralloc_free(transfer); +} + +static struct pipe_context * +ethosu_create_context(struct pipe_screen *screen, + void *priv, unsigned flags) +{ + struct ethosu_context *ctx = rzalloc(NULL, struct ethosu_context); + struct pipe_context *pctx = &ctx->base; + + if (!ctx) + return NULL; + + pctx->screen = screen; + pctx->priv = priv; + + pctx->destroy = ethosu_destroy_context; + + pctx->buffer_map = ethosu_buffer_map; + pctx->buffer_unmap = ethosu_buffer_unmap; + pctx->resource_copy_region = util_resource_copy_region; + pctx->buffer_subdata = u_default_buffer_subdata; + pctx->clear_buffer = u_default_clear_buffer; + + pctx->ml_operation_supported = ethosu_ml_operation_supported; + pctx->ml_subgraph_create = ethosu_ml_subgraph_create; + pctx->ml_subgraph_invoke = ethosu_ml_subgraph_invoke; + pctx->ml_subgraph_read_output = ethosu_ml_subgraph_read_outputs; + pctx->ml_subgraph_destroy = ethosu_ml_subgraph_destroy; + + return pctx; +} + +static struct pipe_resource * 
+ethosu_resource_create(struct pipe_screen *pscreen,
+                       const struct pipe_resource *templat)
+{
+   struct ethosu_screen *screen = ethosu_screen(pscreen);
+   struct drm_ethosu_bo_create arg = {0};
+   struct ethosu_resource *rsc;
+   int ret;
+
+   assert(templat->target == PIPE_BUFFER);
+   assert(templat->height0 == 1);
+   assert(templat->depth0 == 1);
+   assert(templat->array_size == 1);
+
+   rsc = rzalloc(NULL, struct ethosu_resource);
+   if (!rsc)
+      return NULL;
+
+   rsc->base = *templat;
+   rsc->base.screen = pscreen;
+   rsc->base.nr_samples = templat->nr_samples;
+   pipe_reference_init(&rsc->base.reference, 1);
+
+   rsc->bo_size = templat->width0;
+
+   arg.size = templat->width0;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_CREATE, &arg);
+   if (ret < 0)
+      goto free_rsc;
+
+   rsc->handle = arg.handle;
+
+   return &rsc->base;
+
+free_rsc:
+   ralloc_free(rsc);
+   return NULL;
+}
+
+static void
+ethosu_resource_destroy(struct pipe_screen *pscreen,
+                        struct pipe_resource *prsc)
+{
+   struct ethosu_resource *rsc = ethosu_resource(prsc);
+   struct ethosu_screen *screen = ethosu_screen(pscreen);
+   struct drm_gem_close arg = {0};
+   ASSERTED int ret;
+
+   arg.handle = rsc->handle;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
+   assert(ret >= 0);
+
+   ralloc_free(rsc);
+}
+
+static int
+ethosu_screen_get_fd(struct pipe_screen *pscreen)
+{
+   return ethosu_screen(pscreen)->fd;
+}
+
+static void
+dev_query(struct ethosu_screen *screen)
+{
+   ASSERTED int ret;
+   struct drm_ethosu_npu_info *info = &screen->info;
+   struct drm_ethosu_dev_query dev_query = {
+      .type = DRM_ETHOSU_DEV_QUERY_NPU_INFO,
+      .size = sizeof(*info),
+      .pointer = (uintptr_t)info,
+   };
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_DEV_QUERY, &dev_query);
+   assert(ret != -1);
+}
+
+struct pipe_screen *
+ethosu_screen_create(int fd,
+                     const struct pipe_screen_config *config,
+                     struct renderonly *ro)
+{
+   struct ethosu_screen *ethosu_screen;
+   struct pipe_screen *screen;
+
+   ethosu_screen = rzalloc(NULL, struct ethosu_screen);
+   if (!ethosu_screen)
+      return NULL;
+
+   screen = &ethosu_screen->pscreen;
+
+   ethosu_debug = debug_get_option_ethosu_debug();
+
+   ethosu_screen->fd = fd;
+   dev_query(ethosu_screen);
+
+   if (DBG_ENABLED(ETHOSU_DBG_DISABLE_SRAM))
+      ethosu_screen->info.sram_size = 0;
+
+   screen->get_screen_fd = ethosu_screen_get_fd;
+   screen->destroy = ethosu_destroy_screen;
+   screen->context_create = ethosu_create_context;
+   screen->resource_create = ethosu_resource_create;
+   screen->resource_destroy = ethosu_resource_destroy;
+
+   return screen;
+}
\ No newline at end of file
diff --git a/src/gallium/drivers/ethosu/ethosu_device.h b/src/gallium/drivers/ethosu/ethosu_device.h
new file mode 100644
index 00000000000..b121661baad
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_device.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ETHOSU_SCREEN_H
+#define ETHOSU_SCREEN_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "renderonly/renderonly.h"
+#include "util/log.h"
+
+#include "drm-uapi/ethosu_accel.h"
+
+enum ethosu_dbg {
+   ETHOSU_DBG_MSGS = BITFIELD_BIT(0),
+   ETHOSU_DBG_DUMP_BOS = BITFIELD_BIT(1),
+   ETHOSU_DBG_ZERO = BITFIELD_BIT(2),
+   ETHOSU_DBG_DISABLE_NHCWB16 = BITFIELD_BIT(3),
+   ETHOSU_DBG_DISABLE_SRAM = BITFIELD_BIT(4),
+};
+
+extern int ethosu_debug;
+
+#define DBG_ENABLED(flag) unlikely(ethosu_debug & (flag))
+
+#define DBG(fmt, ...) \
+   do {                                                   \
+      if (DBG_ENABLED(ETHOSU_DBG_MSGS))                   \
+         mesa_logd("%s:%d: " fmt, __func__, __LINE__,     \
+                   ##__VA_ARGS__);                        \
+   } while (0)
+
+struct ethosu_screen {
+   struct pipe_screen pscreen;
+
+   int fd;
+   struct drm_ethosu_npu_info info;
+};
+
+static inline struct ethosu_screen *
+ethosu_screen(struct pipe_screen *p)
+{
+   return (struct ethosu_screen *)p;
+}
+
+static inline bool
+ethosu_is_u65(struct ethosu_screen *e)
+{
+   return DRM_ETHOSU_ARCH_MAJOR(e->info.id) == 1;
+}
+
+struct ethosu_context {
+   struct pipe_context base;
+};
+
+static inline struct ethosu_context *
+ethosu_context(struct pipe_context *pctx)
+{
+   return (struct ethosu_context *)pctx;
+}
+
+struct ethosu_resource {
+   struct pipe_resource base;
+
+   uint32_t handle;
+   uint64_t phys_addr;
+   uint64_t obj_addr;
+   uint64_t bo_size;
+};
+
+static inline struct ethosu_resource *
+ethosu_resource(struct pipe_resource *p)
+{
+   return (struct ethosu_resource *)p;
+}
+
+struct pipe_screen *ethosu_screen_create(int fd,
+                                         const struct pipe_screen_config *config,
+                                         struct renderonly *ro);
+
+#endif /* ETHOSU_SCREEN_H */
diff --git a/src/gallium/drivers/ethosu/ethosu_lower.c b/src/gallium/drivers/ethosu/ethosu_lower.c
new file mode 100644
index 00000000000..c452ab21a7e
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_lower.c
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "ethosu_lower.h"
+#include "ethosu_coefs.h"
+#include "ethosu_sched.h"
+
+static bool
+is_depthwise(const struct pipe_ml_operation *poperation)
+{
+   unsigned input_channels = poperation->input_tensors[0]->dims[3];
+   unsigned output_channels = poperation->output_tensors[0]->dims[3];
+
+   return poperation->conv.depthwise && input_channels > 1 &&
+          output_channels > 1;
+}
+
+static unsigned
+needed_total_padding(unsigned input_size, unsigned stride, unsigned filter_size)
+{
+   if (input_size % stride == 0)
+      return MAX2((int)filter_size - (int)stride, 0);
+
+   return MAX2((int)filter_size - (int)(input_size % stride), 0);
+}
+
+static bool
+ethosu_is_part_kernel_first(struct ethosu_operation *operation)
+{
+   if (operation->type != ETHOSU_OPERATION_TYPE_CONVOLUTION)
+      return false;
+
+   if (operation->kernel.depthwise)
+      return false;
+
+   // Determine which block traversal strategy has better DPU utilization
+   unsigned kernel_size = operation->kernel.height * operation->kernel.width;
+   unsigned depth = operation->ifm.shape.depth;
+   float depth_utilization = (float)depth / ethosu_round_up_to_multiple(depth, 32);
+   float part_kernel_utilization = ((float)depth / ethosu_round_up_to_multiple(depth, 8));
+   part_kernel_utilization *= (float)kernel_size / ethosu_round_up_to_multiple(kernel_size, 4);
+
+   // Part-kernel first is always better for ifm depths <= 8
+   if (part_kernel_utilization >= depth_utilization || depth <= 8)
+      return true;
+
+   return false;
+}
+
+static void
+set_feature_maps(struct pipe_tensor *input_tensor,
+                 struct pipe_tensor *output_tensor,
+                 struct ethosu_operation *operation)
+{
+   operation->ifm.tensor_idx = input_tensor->index;
+   operation->ifm.shape.height = input_tensor->dims[1];
+   operation->ifm.shape.width = input_tensor->dims[2];
+   operation->ifm.shape.depth = input_tensor->dims[3];
+   operation->ifm.zero_point = input_tensor->zero_point;
+   operation->ifm.scale = input_tensor->scale;
+   operation->ifm.is_signed = input_tensor->is_signed;
+
+   operation->ofm.tensor_idx = output_tensor->index;
+   operation->ofm.shape.height = output_tensor->dims[1];
+   operation->ofm.shape.width = output_tensor->dims[2];
+  
operation->ofm.shape.depth = output_tensor->dims[3]; + operation->ofm.zero_point = output_tensor->zero_point; + operation->ofm.scale = output_tensor->scale; + operation->ofm.is_signed = output_tensor->is_signed; +} + +static const struct pipe_ml_operation * +ethosu_find_first_consumer(const struct pipe_ml_operation *poperations, + unsigned count, + unsigned tensor_index) +{ + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + for (unsigned j = 0; j < poperation->input_count; j++) + if (poperation->input_tensors[j]->index == tensor_index) + return poperation; + } + + return NULL; +} + +static void +allocate_feature_maps(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + ethosu_allocate_feature_map(subgraph, &operation->ifm); + operation->ifm.tiles.height_0 = operation->ifm.shape.height; + operation->ifm.tiles.height_1 = operation->ifm.shape.height; + operation->ifm.tiles.width_0 = operation->ifm.shape.width; + + ethosu_allocate_feature_map(subgraph, &operation->ofm); + operation->ofm.tiles.height_0 = operation->ofm.shape.height; + operation->ofm.tiles.height_1 = operation->ofm.shape.height; + operation->ofm.tiles.width_0 = operation->ofm.shape.width; +} + +static const struct pipe_ml_operation * +ethosu_find_first_producer(const struct pipe_ml_operation *poperations, unsigned count, + unsigned tensor_index) +{ + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + + for (unsigned j = 0; j < poperation->output_count; j++) { + if (poperation->output_tensors[j]->index == tensor_index) + return poperation; + } + } + + return NULL; +} + +static void +ethosu_lower_convolution(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct pipe_tensor *input_tensor, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_CONVOLUTION; + + operation->conv.depthwise = is_depthwise(poperation); + // operation->padding_same = poperation->conv.padding_same; + // operation->stride = poperation->conv.stride_x; + + set_feature_maps(input_tensor, poperation->output_tensors[0], operation); + + operation->kernel.height = poperation->conv.weight_tensor->dims[1]; + operation->kernel.width = poperation->conv.weight_tensor->dims[2]; + operation->kernel.stride_y = poperation->conv.stride_y; + operation->kernel.stride_x = poperation->conv.stride_x; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + operation->kernel.depthwise = is_depthwise(poperation); + operation->kernel.scale = poperation->conv.weight_tensor->scale; + operation->kernel.zero_point = poperation->conv.weight_tensor->zero_point; + operation->kernel.is_signed = poperation->conv.weight_tensor->is_signed; + + operation->conv.part_kernel_first = ethosu_is_part_kernel_first(operation); + + if (poperation->conv.padding_same) { + unsigned vert = needed_total_padding(input_tensor->dims[1], poperation->conv.stride_y, poperation->conv.weight_tensor->dims[1]); + unsigned horiz = needed_total_padding(input_tensor->dims[2], poperation->conv.stride_x, poperation->conv.weight_tensor->dims[2]); + + operation->pad.top = vert / 2; + operation->pad.left = horiz / 2; + operation->pad.bottom = (vert + 1) / 2; + operation->pad.right = (horiz + 1) / 2; + } else { + operation->pad.top = 0; + operation->pad.left = 0; + operation->pad.bottom = 0; + operation->pad.right = 0; + } + + allocate_feature_maps(subgraph, operation); + + ethosu_sched_operation(subgraph, operation); 
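+
+   /* A worked example of the SAME-padding split above (illustrative
+    * numbers, not from the original patch): a 224x224 IFM with stride 2
+    * and a 3x3 kernel needs needed_total_padding(224, 2, 3) =
+    * MAX2(3 - 2, 0) = 1 extra row/column in total, and the odd pixel
+    * lands on the bottom/right: pad.top = 0, pad.bottom = 1.  The block
+    * config chosen by ethosu_sched_operation() above and the
+    * coefficients encoded by fill_coefs() below both depend on this
+    * final geometry.
+    */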
+ fill_coefs(subgraph, operation, poperation->conv.bias_tensor->resource, poperation->conv.weight_tensor->resource); +} + +static void +ethosu_lower_pooling(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = poperation->pooling.type == PIPE_ML_POOLING_TYPE_AVG; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + + operation->kernel.height = poperation->pooling.filter_height; + operation->kernel.width = poperation->pooling.filter_width; + operation->kernel.stride_y = poperation->pooling.stride_y; + operation->kernel.stride_x = poperation->pooling.stride_x; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + if (poperation->pooling.padding_same) { + unsigned vert = needed_total_padding(operation->ifm.shape.height, poperation->pooling.stride_y, poperation->pooling.filter_height); + unsigned horiz = needed_total_padding(operation->ifm.shape.width, poperation->pooling.stride_x, poperation->pooling.filter_width); + + operation->pad.top = vert / 2; + operation->pad.left = horiz / 2; + operation->pad.bottom = (vert + 1) / 2; + operation->pad.right = (horiz + 1) / 2; + } else { + operation->pad.top = 0; + operation->pad.left = 0; + operation->pad.bottom = 0; + operation->pad.right = 0; + } + + allocate_feature_maps(subgraph, operation); + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_concatenation(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + unsigned input_idx, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[input_idx], poperation->output_tensors[0], operation); + operation->ofm.shape.depth = operation->ifm.shape.depth; + + operation->round_mode = ETHOSU_ROUNDING_NATURAL; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + for (unsigned i = 0; i < input_idx; i++) { + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, operation->ofm.tensor_idx); + + if (tensor->layout == ETHOSU_LAYOUT_NHWC) + operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[3]; + else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) + operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * ALIGN(poperation->input_tensors[i]->dims[3], 16); + else + assert(0 && "Unsupported layout"); + } + + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_resize(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + operation->ifm.zero_point = 0; + operation->ofm.zero_point = 0; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + operation->upscale = true; + + allocate_feature_maps(subgraph, operation); + ethosu_sched_operation(subgraph, operation); +} + +static void 
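+/* Lowered as a 1x1 average pool that just re-bases the IFM: the slice's
+ * begin[] coordinates are folded into a single element offset which is
+ * added to the IFM base address (the stride math below assumes an NHWC
+ * layout).
+ */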
+ethosu_lower_strided_slice(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + operation->ifm.shape = operation->ofm.shape; + operation->ifm.zero_point = 0; + operation->ofm.zero_point = 0; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + + unsigned augmented_coord[5]; + augmented_coord[0] = 0; + for (int i = 0; i < 4; ++i) { + augmented_coord[i + 1] = poperation->slice.begin[i]; + } + + unsigned augmented_strides[5]; + augmented_strides[0] = operation->ifm.shape.depth * operation->ifm.shape.width * operation->ifm.shape.height; + augmented_strides[1] = 1; + augmented_strides[2] = operation->ifm.shape.depth * operation->ifm.shape.width; + augmented_strides[3] = operation->ifm.shape.depth; + augmented_strides[4] = 1; + + unsigned address_offset = 0; + for (int i = 0; i < 5; ++i) + address_offset += augmented_coord[i] * augmented_strides[i]; + + operation->ifm.tiles.addresses[0] += address_offset; + + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_add(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_ELTWISE; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + + operation->ifm2.tensor_idx = poperation->input_tensors[1]->index; + operation->ifm2.shape.height = poperation->input_tensors[1]->dims[1]; + operation->ifm2.shape.width = poperation->input_tensors[1]->dims[2]; + operation->ifm2.shape.depth = poperation->input_tensors[1]->dims[3]; + operation->ifm2.zero_point = poperation->input_tensors[1]->zero_point; + operation->ifm2.scale = poperation->input_tensors[1]->scale; + operation->ifm2.is_signed = poperation->input_tensors[1]->is_signed; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + + ethosu_allocate_feature_map(subgraph, &operation->ifm2); + operation->ifm2.tiles.height_0 = operation->ifm2.shape.height; + operation->ifm2.tiles.height_1 = operation->ifm2.shape.height; + operation->ifm2.tiles.width_0 = operation->ifm2.shape.width; + + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_dma(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *conv_operation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_DMA; + + operation->dma.address = conv_operation->conv.scales.address; + operation->dma.size = conv_operation->conv.scales.size + conv_operation->conv.weights.size; + + conv_operation->conv.scales.region = SCRATCH_REGION; + conv_operation->conv.scales.address = 0; + + conv_operation->conv.weights.region = SCRATCH_REGION; + conv_operation->conv.weights.address = conv_operation->conv.scales.size; +} + +static void +register_tensors(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperations, + unsigned count) +{ + for 
(unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + + for (unsigned j = 0; j < poperation->input_count; j++) { + struct pipe_tensor *ptensor = poperation->input_tensors[j]; + ethosu_register_tensor(subgraph, ptensor); + } + + for (unsigned j = 0; j < poperation->output_count; j++) { + struct pipe_tensor *ptensor = poperation->output_tensors[j]; + ethosu_register_tensor(subgraph, ptensor); + + if (!DBG_ENABLED(ETHOSU_DBG_DISABLE_NHCWB16)) { + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, ptensor->index); + if (tensor->shape.depth % 16 == 0 && + ethosu_find_first_consumer(poperations, count, ptensor->index)) { + tensor->layout = ETHOSU_LAYOUT_NHCWB16; + } + } + } + } +} + +void +ethosu_lower_graph(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperations, unsigned count) +{ + register_tensors(subgraph, poperations, count); + + /* Lower */ + for (int i = 0; i < count; i++) { + struct ethosu_operation operation = {0}; + + switch (poperations[i].type) { + + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: { + struct pipe_tensor *input_tensor = poperations[i].input_tensors[0]; + const struct pipe_ml_operation *producer = ethosu_find_first_producer(poperations, count, input_tensor->index); + bool padded_input = producer && producer->type == PIPE_ML_OPERATION_TYPE_PAD; + + if (padded_input) { + input_tensor = producer->input_tensors[0]; + } + + ethosu_lower_convolution(subgraph, &poperations[i], input_tensor, &operation); + + if (padded_input) { + operation.pad.top = 1; + operation.pad.left = 1; + } + + if (operation.conv.scales.size + operation.conv.weights.size <= + ethosu_screen(subgraph->base.context->screen)->info.sram_size) { + struct ethosu_operation dma_operation = {0}; + ethosu_lower_dma(subgraph, &poperations[i], &operation, &dma_operation); + + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + dma_operation); + } + + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_ADD: { + ethosu_lower_add(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_POOLING: { + ethosu_lower_pooling(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: { + ethosu_lower_strided_slice(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_CONCATENATION: { + for (int j = 0; j < poperations[i].input_count; j++) { + ethosu_lower_concatenation(subgraph, &poperations[i], j, &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + } + break; + } + + case PIPE_ML_OPERATION_TYPE_RESIZE: { + ethosu_lower_resize(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_PAD: { + // Just ignore the pad operation for now, as it will be handled by its consumers + break; + } + + default: + DBG("poperation->type %d\n", poperations[i].type); + UNREACHABLE("Unsupported ML operation type"); + } + } +} \ No newline at end of file diff --git a/src/gallium/drivers/ethosu/ethosu_lower.h b/src/gallium/drivers/ethosu/ethosu_lower.h new file mode 100644 
index 00000000000..bcaf57b0cbc --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_lower.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef ETHOSU_LOWER_H +#define ETHOSU_LOWER_H + +#include "ethosu_ml.h" + +void +ethosu_lower_graph(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperations, unsigned count); + +#endif /* ETHOSU_LOWER_H */ diff --git a/src/gallium/drivers/ethosu/ethosu_ml.c b/src/gallium/drivers/ethosu/ethosu_ml.c new file mode 100644 index 00000000000..de513f78633 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_ml.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "util/u_inlines.h" + +#include +#include +#include +#include +#include +#include + +#include "drm-uapi/ethosu_accel.h" + +#include "ethosu_cmd.h" +#include "ethosu_lower.h" +#include "ethosu_ml.h" + +struct ethosu_block IFM_UBLOCK = {2, 2, 8}; +struct ethosu_block OFM_UBLOCK = {2, 2, 8}; +struct ethosu_block ARCH_OFM_BLOCK_MAX = {64, 32, 128}; +struct ethosu_block SUB_KERNEL_MAX = {8, 8, 65536}; + +void +ethosu_dump_buffer(const uint8_t *ptr, char *name, int operation_nr, + int suboperation_nr, int offset, unsigned size) +{ + char buffer[255]; + + snprintf(buffer, sizeof(buffer), "mesa-%s-%03u-%03u.bin", name, operation_nr, + suboperation_nr); + + FILE *f = fopen(buffer, "wb"); + assert(f); + fwrite(ptr + offset, 1, size, f); + if (ferror(f)) { + DBG("Error in writing to file: %s\n", strerror(errno)); + } + fflush(f); + fclose(f); +} + +void +ethosu_register_tensor(struct ethosu_subgraph *subgraph, + const struct pipe_tensor *ptensor) +{ + struct ethosu_tensor new_tensor = {0}; + new_tensor.index = ptensor->index; + new_tensor.shape.height = ptensor->dims[1]; + new_tensor.shape.width = ptensor->dims[2]; + new_tensor.shape.depth = ptensor->dims[3]; + new_tensor.layout = ETHOSU_LAYOUT_NHWC; + util_dynarray_append(&subgraph->tensors, struct ethosu_tensor, new_tensor); +} + +void +ethosu_allocate_feature_map(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map) +{ + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx); + unsigned size; + + if (tensor->layout == ETHOSU_LAYOUT_NHWC) { + size = tensor->shape.width * tensor->shape.height * tensor->shape.depth; + } else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) { + size = tensor->shape.width * tensor->shape.height * ALIGN(tensor->shape.depth, 16); + } else { + assert(0 && "Unsupported layout"); + size = 0; // This should never happen + } + + assert(tensor); + + if (tensor->size > 0) { + feature_map->tiles.addresses[0] = tensor->offset; + return; + } + + tensor->offset = subgraph->io_used; + tensor->size = size; + subgraph->io_used += ALIGN_POT(size, 16); + + feature_map->tiles.addresses[0] = tensor->offset; +} + +struct ethosu_tensor * +ethosu_find_tensor(struct ethosu_subgraph *subgraph, unsigned tensor_idx) +{ + util_dynarray_foreach (&subgraph->tensors, struct ethosu_tensor, tensor) { + if (tensor->index == tensor_idx) { + return tensor; + } + } + return NULL; +} + +int +ethosu_round_up_to_multiple(int a, int b) +{ + return ((a + b - 1) / b) * b; +} + +int +ethosu_round_up_divide(int a, int b) +{ + return (a + b - 1) / b; +} + +int +ethosu_quantize_scale(double scale, uint32_t *shift) +{ + int exponent = 0; + double significand = 
frexp(scale, &exponent); + uint32_t quantized_scale = round(significand * (double)(1LL << 31)); + *shift = 31 - exponent; + if (*shift > 63) { + if (quantized_scale > exp2(*shift - 63)) { + quantized_scale = quantized_scale >> (*shift - 63); + *shift = 63; + } else { + // Not possible to get back within bounds, set scale and shift to 0 + // as the shift would shift away all relevant bits anyway. + quantized_scale = 0; + *shift = 0; + } + } else if (*shift < 0 && quantized_scale < exp2(*shift + 32)) { + quantized_scale = quantized_scale << (0 - *shift); + *shift = 0; + } + + return quantized_scale; +} + +static bool +tensor_quantization_supported(struct pipe_tensor *tensor) +{ + /* + * Per-axis quantization not supported, for details see: + * https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor + */ + return tensor->scales == NULL && tensor->zero_points == NULL; +} + +bool +ethosu_ml_operation_supported(struct pipe_context *pcontext, + const struct pipe_ml_operation *operation) +{ + bool supported = false; + + switch (operation->type) { + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: { + struct pipe_tensor *input_tensor = operation->input_tensors[0]; + struct pipe_tensor *weight_tensor = operation->conv.weight_tensor; + struct pipe_tensor *bias_tensor = operation->conv.bias_tensor; + struct pipe_tensor *output_tensor = operation->output_tensors[0]; + + // Dilation and per-axis quantization not yet implemented + if (tensor_quantization_supported(input_tensor) && + tensor_quantization_supported(weight_tensor) && + tensor_quantization_supported(bias_tensor) && + tensor_quantization_supported(output_tensor) && + operation->conv.dilation_width_factor == 1 && + operation->conv.dilation_height_factor == 1) + supported = true; + + break; + } + case PIPE_ML_OPERATION_TYPE_ADD: + supported = operation->input_tensors[0]->resource == NULL && + operation->input_tensors[1]->resource == NULL; + break; + case PIPE_ML_OPERATION_TYPE_POOLING: + case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: + case PIPE_ML_OPERATION_TYPE_PAD: + case PIPE_ML_OPERATION_TYPE_RESIZE: + supported = true; + break; + case PIPE_ML_OPERATION_TYPE_CONCATENATION: + supported = operation->conc.axis == 3 || + operation->conc.axis == -1; + break; + default: + supported = false; + } + + return supported; +} + +struct pipe_ml_subgraph * +ethosu_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count) +{ + struct pipe_screen *pscreen = pcontext->screen; + struct ethosu_screen *screen = ethosu_screen(pscreen); + struct ethosu_subgraph *subgraph; + + subgraph = calloc(1, sizeof(*subgraph)); + subgraph->base.context = pcontext; + + util_dynarray_init(&subgraph->tensors, NULL); + util_dynarray_init(&subgraph->operations, NULL); + + ethosu_lower_graph(subgraph, poperations, count); + + ethosu_emit_cmdstream(subgraph); + + struct drm_ethosu_cmdstream_bo_create cmd_bo_create = { + .size = (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor), + .data = (uintptr_t)subgraph->cmdstream, + }; + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) + ethosu_dump_buffer((uint8_t *)subgraph->cmdstream, "cmdstream", 0, 0, 0, (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor)); + + int ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE, &cmd_bo_create); + assert(ret == 0); + + free(subgraph->cmdstream); + + subgraph->cmdstream_bo = cmd_bo_create.handle; + + if (subgraph->coefs_used > 0) { + subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, 
PIPE_USAGE_DEFAULT, subgraph->coefs_used); + pipe_buffer_write(subgraph->base.context, subgraph->coefs_rsrc, 0, subgraph->coefs_used, subgraph->coefs); + + free(subgraph->coefs); + subgraph->coefs = NULL; + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, + PIPE_MAP_READ, &transfer_in); + ethosu_dump_buffer(buf, "coefs", 0, 0, 0, pipe_buffer_size(subgraph->coefs_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + } + + subgraph->io_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->io_used); + + return &subgraph->base; +} + +void +ethosu_ml_subgraph_invoke(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned inputs_count, unsigned input_idxs[], + void *inputs[], bool is_signed[]) +{ + struct ethosu_screen *screen = ethosu_screen(pcontext->screen); + struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph); + struct drm_ethosu_submit submit = {0}; + struct drm_ethosu_job job = {0}; + struct timespec start, end; + int ret; + + for (unsigned i = 0; i < inputs_count; i++) { + struct ethosu_tensor *input = ethosu_find_tensor(subgraph, input_idxs[i]); + assert(input); + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) + ethosu_dump_buffer(inputs[i], "input", 0, 0, 0, input->size); + + pipe_buffer_write(pcontext, subgraph->io_rsrc, input->offset, input->size, inputs[i]); + } + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, + PIPE_MAP_READ, &transfer_in); + ethosu_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + + job.cmd_bo = subgraph->cmdstream_bo; + + if (subgraph->coefs_rsrc) { + job.region_bo_handles[COEFS_REGION] = ethosu_resource(subgraph->coefs_rsrc)->handle; + if (!DBG_ENABLED(ETHOSU_DBG_DISABLE_SRAM)) { + job.region_bo_handles[SCRATCH_REGION] = 0; + job.sram_size = screen->info.sram_size; + } + } + + job.region_bo_handles[IO_REGION] = ethosu_resource(subgraph->io_rsrc)->handle; + + submit.jobs = (uintptr_t)&job; + submit.job_count = 1; + + if (DBG_ENABLED(ETHOSU_DBG_MSGS)) + clock_gettime(CLOCK_MONOTONIC_RAW, &start); + + ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_SUBMIT, &submit); + assert(ret == 0); + + if (DBG_ENABLED(ETHOSU_DBG_MSGS)) { + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + long long duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + DBG("Submission took %lld ms\n", duration_ns / 1000000); + + /* Force a sync */ + struct pipe_transfer *transfer_in; + pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, PIPE_MAP_READ, &transfer_in); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + DBG("Execution took %lld ms\n", duration_ns / 1000000); + } +} + +void +ethosu_ml_subgraph_read_outputs(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned outputs_count, + unsigned output_idxs[], void *outputsv[], + bool is_signed[]) +{ + struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph); + uint8_t **outputs = (uint8_t **)outputsv; + + for (int i = 0; i < outputs_count; i++) { + struct ethosu_tensor *output = ethosu_find_tensor(subgraph, output_idxs[i]); + + if 
(DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, + PIPE_MAP_READ, &transfer_in); + ethosu_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + + pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size, outputs[i]); + } +} + +void +ethosu_ml_subgraph_destroy(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph) +{ + int ret; + struct drm_gem_close arg = {0}; + struct ethosu_screen *screen = ethosu_screen(pcontext->screen); + struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph); + + pipe_resource_reference(&subgraph->io_rsrc, NULL); + pipe_resource_reference(&subgraph->coefs_rsrc, NULL); + + arg.handle = subgraph->cmdstream_bo; + ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg); + assert(ret >= 0); + + util_dynarray_fini(&subgraph->operations); + util_dynarray_fini(&subgraph->tensors); + + free(subgraph); +} diff --git a/src/gallium/drivers/ethosu/ethosu_ml.h b/src/gallium/drivers/ethosu/ethosu_ml.h new file mode 100644 index 00000000000..9dc9bbe9869 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_ml.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef ETHOSU_ML_H +#define ETHOSU_ML_H + +#include + +#include "ethosu_device.h" + +#define SHRAM_BANKS 48 +#define SHRAM_RESERVED_OUTPUT_BANKS 2 +#define SHRAM_RESERVED_UNUSED_BANKS 2 +#define SHRAM_RESERVED_END_BANKS 2 +#define SHRAM_TOTAL_BANKS SHRAM_BANKS +#define SHRAM_BANK_SIZE_BYTES 1024 +#define ACC_BITS 32 /* Use for now always 32-bit accumulators */ +#define IFM_GRANULE 8 +#define ACC_GRANULE 16 +#define ARCH_SPLIT_DEPTH 16 +#define BANK_SIZE_BYTES 1024 +#define IFM_GRANULE 8 + +extern struct ethosu_block ARCH_OFM_BLOCK_MAX; +extern struct ethosu_block SUB_KERNEL_MAX; +extern struct ethosu_block IFM_UBLOCK; +extern struct ethosu_block OFM_UBLOCK; + +#define COEFS_REGION 0 +#define IO_REGION 1 +#define SCRATCH_REGION 2 + +struct ethosu_block { + unsigned width; + unsigned height; + unsigned depth; +}; + +enum ethosu_operation_type { + ETHOSU_OPERATION_TYPE_CONVOLUTION, + ETHOSU_OPERATION_TYPE_POOLING, + ETHOSU_OPERATION_TYPE_ELTWISE, + ETHOSU_OPERATION_TYPE_DMA, +}; + +struct ethosu_tile_box { + unsigned height_0; /* The height of tile 0 */ + unsigned height_1; /* The height of tile 1, 0 if unused */ + unsigned width_0; /* The width of tile 0, and tile 2 (if used) */ + unsigned addresses[4]; /* A list of 4 addresses, set unused addresses to 0 */ +}; + +enum ethosu_layout { + ETHOSU_LAYOUT_NHWC, + ETHOSU_LAYOUT_NHCWB16, +}; + +enum ethosu_rounding_mode { + ETHOSU_ROUNDING_DOUBLE = 0, + ETHOSU_ROUNDING_TRUNCATE, + ETHOSU_ROUNDING_NATURAL, +}; +struct ethosu_feature_map { + unsigned tensor_idx; + struct ethosu_block shape; + bool is_signed; + struct ethosu_tile_box tiles; + unsigned zero_point; + float scale; +}; + +struct ethosu_kernel { + unsigned height; + unsigned width; + unsigned stride_y; + unsigned stride_x; + unsigned dilation_y; + unsigned dilation_x; + bool depthwise; + bool is_signed; + unsigned zero_point; + float scale; +}; + +struct ethosu_padding { + unsigned top; + unsigned left; + unsigned bottom; + unsigned right; +}; + +struct ethosu_address_range { + unsigned region; + unsigned address; + long size; +}; + +struct ethosu_shram_layout { + unsigned ib_start; + unsigned ib_end; + unsigned ib_start2; + unsigned ab_start; + 
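+   /* These fields (and lut_start below) are bank indices into the
+    * 48-bank, 1 KB-per-bank SHRAM.  A sketch of the layout that
+    * try_block_config() in ethosu_sched.c computes:
+    *
+    *   [0, ib_start)          OFM output banks
+    *   [ib_start, ib_end)     IFM slots (sized for double-buffering)
+    *   ib_start2              second IFM for elementwise operations
+    *   [ab_start, lut_start)  accumulators
+    *   [lut_start, 48)        LUT, at least the 2 reserved end banks
+    */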
unsigned lut_start; +}; + +enum ethosu_acc_type { + ETHOSU_ACC_TYPE_INT_32BIT = 0, + ETHOSU_ACC_TYPE_INT_40BIT, + ETHOSU_ACC_TYPE_FP_S5_10, +}; + +struct ethosu_block_config { + struct ethosu_block ifm_block; + struct ethosu_block ofm_block; + struct ethosu_shram_layout shram_layout; + unsigned bank_size; + enum ethosu_acc_type acc_type; + bool is_partkernel; +}; + +#define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT*/ + +struct ethosu_operation { + enum ethosu_operation_type type; + + struct ethosu_block_config block_config; + + union { + struct { + struct ethosu_address_range weights; + struct ethosu_address_range scales; + bool part_kernel_first; + bool depthwise; + } conv; + + struct { + bool avg; /* true for avg, false for max */ + } pooling; + + struct { + unsigned lut_bytes; + } eltwise; + + struct { + unsigned address; + long size; + } dma; + }; + + struct ethosu_feature_map ifm; + struct ethosu_feature_map ifm2; + struct ethosu_feature_map ofm; + + struct ethosu_kernel kernel; + struct ethosu_padding pad; + bool upscale; + enum ethosu_rounding_mode round_mode; + + struct ethosu_address_range read_accesses[MAX_MEMORY_ACCESSES]; + struct ethosu_address_range write_accesses[MAX_MEMORY_ACCESSES]; +}; + +struct ethosu_tensor { + unsigned index; + unsigned offset; + unsigned size; + struct ethosu_block shape; + enum ethosu_layout layout; +}; + +struct ethosu_subgraph { + struct pipe_ml_subgraph base; + + struct util_dynarray operations; /* ethosu_operation */ + struct util_dynarray tensors; /* ethosu_tensor* */ + + unsigned cmdstream_used; + uint32_t *cmdstream; + uint32_t *cursor; + uint32_t cmdstream_bo; + + struct pipe_resource *io_rsrc; + unsigned io_used; + + uint8_t *coefs; + struct pipe_resource *coefs_rsrc; + unsigned coefs_used; +}; + +bool +ethosu_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation); + +struct pipe_ml_subgraph * +ethosu_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count); + +void ethosu_ml_subgraph_invoke(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned inputs_count, unsigned input_idxs[], + void *inputs[], bool is_signed[]); + +void ethosu_ml_subgraph_read_outputs(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned outputs_count, + unsigned output_idxs[], void *outputs[], + bool is_signed[]); + +void ethosu_ml_subgraph_destroy(struct pipe_context *context, + struct pipe_ml_subgraph *psubgraph); + +void ethosu_allocate_feature_map(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map); + +void ethosu_register_tensor(struct ethosu_subgraph *subgraph, const struct pipe_tensor *ptensor); + +struct ethosu_tensor *ethosu_find_tensor(struct ethosu_subgraph *subgraph, unsigned tensor_idx); + +void ethosu_dump_buffer(const uint8_t *ptr, char *name, int operation_nr, + int suboperation_nr, int offset, unsigned size); + +int ethosu_round_up_to_multiple(int a, int b); + +int ethosu_round_up_divide(int a, int b); + +int ethosu_quantize_scale(double scale, uint32_t *shift); + +#endif /* ETHOSU_ML_H */ diff --git a/src/gallium/drivers/ethosu/ethosu_sched.c b/src/gallium/drivers/ethosu/ethosu_sched.c new file mode 100644 index 00000000000..45021362402 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_sched.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2025 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "ethosu_sched.h" + +static int +required_input_size(int value, 
int stride, int border)
+{
+   return (value - 1) * stride + border;
+}
+
+static struct ethosu_block
+_get_ifm_blocksize(struct ethosu_operation *operation, struct ethosu_block ofm_block)
+{
+   struct ethosu_block ifm_block = {0};
+
+   // IFM block height
+   int h = required_input_size(ofm_block.height, operation->kernel.stride_y, MIN2(operation->kernel.height, SUB_KERNEL_MAX.height));
+   h = ALIGN(h, OFM_UBLOCK.height);
+
+   // IFM block width
+   int w = required_input_size(ofm_block.width, operation->kernel.stride_x, MIN2(operation->kernel.width, SUB_KERNEL_MAX.width));
+   w = ALIGN(w, OFM_UBLOCK.width);
+
+   ifm_block.height = h;
+   ifm_block.width = w;
+   ifm_block.depth = ofm_block.depth;
+
+   return ifm_block;
+}
+
+static bool
+try_block_config(struct ethosu_operation *operation, struct ethosu_block ofm_block, struct ethosu_block ifm_block, struct ethosu_shram_layout *layout)
+{
+   int ifm_bytes = ifm_block.width * ifm_block.height * ALIGN(ifm_block.depth, 8);
+   int ifm_banks = ALIGN(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE);
+   int lut_bytes = operation->type == ETHOSU_OPERATION_TYPE_ELTWISE ? operation->eltwise.lut_bytes : 0;
+   int lut_banks = MAX2(DIV_ROUND_UP(lut_bytes, 1024), SHRAM_RESERVED_END_BANKS);
+   int lut_start = SHRAM_TOTAL_BANKS - lut_banks;
+   int ifm_end = SHRAM_RESERVED_OUTPUT_BANKS + ifm_banks;
+   int ifm2_start = ifm_end;
+   int acc_start = lut_start;
+
+   if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) {
+      int acc_bytes = (ofm_block.width * ofm_block.height * ALIGN(ofm_block.depth, 8) * 32) / 8;
+      int acc_banks = ALIGN(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE);
+      acc_start -= acc_banks;
+   } else {
+      int ifm2_banks = ifm_banks; /* TODO: Fix for scalar eltwise */
+
+      if (ifm2_start + ifm2_banks > acc_start)
+         return false;
+
+      ifm_end = acc_start;
+   }
+
+   if (ifm_end > acc_start)
+      return false;
+
+   layout->ib_start = SHRAM_RESERVED_OUTPUT_BANKS;
+   layout->ib_start2 = ifm2_start;
+   layout->ib_end = ifm_end;
+   layout->ab_start = acc_start;
+   layout->lut_start = lut_start;
+
+   return true;
+}
+
+static struct ethosu_block_config
+find_block_config(struct ethosu_operation *operation)
+{
+   struct ethosu_block_config config = {};
+   struct ethosu_block search_space = ARCH_OFM_BLOCK_MAX;
+   float ofm_elements = operation->ofm.shape.width * operation->ofm.shape.height * operation->ofm.shape.depth;
+   float ifm_elements = operation->ifm.shape.width * operation->ifm.shape.height * operation->ifm.shape.depth;
+   bool is_pooling = operation->type == ETHOSU_OPERATION_TYPE_POOLING;
+   bool is_depthwise = operation->conv.depthwise;
+   bool is_equal_depth = is_pooling || is_depthwise || operation->type == ETHOSU_OPERATION_TYPE_ELTWISE;
+   bool is_convolution = operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION;
+   float best_cost = FLT_MAX;
+   float best_coverage = FLT_MAX;
+
+   search_space.width = MIN2(search_space.width, operation->ofm.shape.width);
+   search_space.height = MIN2(search_space.height, operation->ofm.shape.height);
+   search_space.depth = MIN2(search_space.depth, operation->ofm.shape.depth);
+
+   unsigned depth = MAX2(OFM_UBLOCK.depth, MIN2(search_space.depth, ARCH_SPLIT_DEPTH));
+
+   if (depth < operation->ofm.shape.depth) {
+      depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
+   }
+
+   search_space.width = ALIGN(search_space.width, OFM_UBLOCK.width);
+   search_space.height = ALIGN(search_space.height, OFM_UBLOCK.height);
+   search_space.depth = ALIGN(search_space.depth, OFM_UBLOCK.depth);
+
+   while (depth <= search_space.depth) {
+      bool wont_fit[search_space.height + 1][search_space.width + 1];
+      memset(wont_fit, 0, sizeof(wont_fit));
+
+      for (unsigned height = OFM_UBLOCK.height; height <= search_space.height; height += OFM_UBLOCK.height) {
+         for (unsigned width = OFM_UBLOCK.width; width <= search_space.width; width += OFM_UBLOCK.width) {
+
+            if (wont_fit[height][width])
+               continue;
+
+            struct ethosu_block ofm_block = {.width = width, .height = height, .depth = depth};
+            struct ethosu_block ifm_block = _get_ifm_blocksize(operation, ofm_block);
+
+            if (!is_equal_depth)
+               ifm_block.depth = ALIGN(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth);
+
+            // Try to fit the blocks in SHRAM
+            struct ethosu_shram_layout layout = {0};
+            if (try_block_config(operation, ofm_block, ifm_block, &layout)) {
+
+               struct ethosu_block full_blocks = {.width = DIV_ROUND_UP(operation->ofm.shape.width, ofm_block.width),
+                                                  .height = DIV_ROUND_UP(operation->ofm.shape.height, ofm_block.height),
+                                                  .depth = DIV_ROUND_UP(operation->ofm.shape.depth, ofm_block.depth)};
+               float blocks[3] = {operation->ofm.shape.width / (float)ofm_block.width,
+                                  operation->ofm.shape.height / (float)ofm_block.height,
+                                  operation->ofm.shape.depth / (float)ofm_block.depth};
+
+               float weight_area = is_convolution ? operation->kernel.width * operation->kernel.height : 0;
+               float weight_fetch = weight_area * operation->ifm.shape.depth * full_blocks.width * full_blocks.height;
+               if (!is_depthwise)
+                  weight_fetch *= blocks[2] * ofm_block.depth;
+
+               float ifm_fetch = ifm_block.width * ifm_block.height * operation->ifm.shape.depth * blocks[0] * blocks[1];
+               if (!is_equal_depth)
+                  ifm_fetch *= full_blocks.depth;
+
+               float relative_cost = 0;
+               if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE)
+                  relative_cost = (ifm_fetch + weight_fetch) / ofm_elements;
+               else
+                  relative_cost = ofm_elements / (height * width * depth);
+
+               if (ifm_elements < ifm_block.width * ifm_block.height * ifm_block.depth * 2)
+                  relative_cost /= 2.0f;
+
+               if (relative_cost <= best_cost) {
+                  bool choose_this = false;
+
+                  if (relative_cost == best_cost) {
+                     struct ethosu_block coverage_shape = {
+                        .height = MIN2(ifm_block.height, operation->ifm.shape.height),
+                        .width = MIN2(ifm_block.width, operation->ifm.shape.width),
+                        .depth = MIN2(ifm_block.depth, operation->ifm.shape.depth)};
+                     float coverage = (float)(operation->ifm.shape.width * operation->ifm.shape.height) /
+                                      (float)MAX2(1, coverage_shape.width * coverage_shape.height);
+
+                     if (coverage <= best_coverage && (height <= 4 && width <= 4)) {
+                        best_coverage = coverage;
+                        choose_this = true;
+                     }
+                  } else {
+                     best_coverage = FLT_MAX;
+                     choose_this = true;
+                  }
+
+                  if (choose_this) {
+                     config.shram_layout = layout;
+                     config.ifm_block = ifm_block;
+                     config.ofm_block.height = height;
+                     config.ofm_block.width = width;
+                     config.ofm_block.depth = depth;
+
+                     best_cost = relative_cost;
+                  }
+               }
+            } else {
+               wont_fit[height][width] = true;
+            }
+         }
+      }
+
+      depth += OFM_UBLOCK.depth;
+      if (depth < operation->ofm.shape.depth) {
+         depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
+      }
+   }
+
+   return config;
+}
+
+void
+ethosu_sched_operation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   operation->block_config = find_block_config(operation);
+}
diff --git a/src/gallium/drivers/ethosu/ethosu_sched.h b/src/gallium/drivers/ethosu/ethosu_sched.h
new file mode 100644
index 00000000000..eb5876fd907
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_sched.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ETHOSU_SCHED_H
+#define ETHOSU_SCHED_H
+
+#include "ethosu_ml.h"
+
+void ethosu_sched_operation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation);
+
+#endif /* ETHOSU_SCHED_H */
diff --git a/src/gallium/drivers/ethosu/gen_header.py b/src/gallium/drivers/ethosu/gen_header.py
new file mode 100644
index 00000000000..b54516a812c
--- /dev/null
+++ b/src/gallium/drivers/ethosu/gen_header.py
@@ -0,0 +1,125 @@
+#!/usr/bin/python3
+#
+# Copyright © 2019-2024 Google, Inc.
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+import os
+import argparse
+import time
+import datetime
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+def dump_c(args, guard, func):
+    p = Parser()
+
+    try:
+        p.parse(args.rnn, args.xml)
+    except Error as e:
+        print(e, file=sys.stderr)
+        exit(1)
+
+    print("#ifndef %s\n#define %s\n" % (guard, guard))
+
+    print("""/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng gen_header.py tool in this git repository:
+http://gitlab.freedesktop.org/mesa/mesa/
+git clone https://gitlab.freedesktop.org/mesa/mesa.git
+
+The rules-ng-ng source files this header was generated from are:
+""")
+    maxlen = 0
+    for filepath in p.xml_files:
+        maxlen = max(maxlen, len(filepath))
+    for filepath in p.xml_files:
+        pad = " " * (maxlen - len(filepath))
+        filesize = str(os.path.getsize(filepath))
+        filesize = " " * (7 - len(filesize)) + filesize
+        filetime = time.ctime(os.path.getmtime(filepath))
+        print("- " + filepath + pad + " (" + filesize + " bytes, from " + filetime + ")")
+    if p.copyright_year:
+        current_year = str(datetime.date.today().year)
+        print()
+        print("Copyright (C) %s-%s by the following authors:" % (p.copyright_year, current_year))
+        for author in p.authors:
+            print("- " + author)
+    if p.license:
+        print(p.license)
+    print("*/")
+
+    print()
+    print("#ifdef __KERNEL__")
+    print("#include <linux/bug.h>")
+    print("#define assert(x) BUG_ON(!(x))")
+    print("#else")
+    print("#include <assert.h>")
+    print("#endif")
+    print()
+
+    print("#ifdef __cplusplus")
+    print("#define __struct_cast(X)")
+    print("#else")
+    print("#define __struct_cast(X) (struct X)")
+    print("#endif")
+    print()
+
+    func(p)
+
+    print("\n#endif /* %s */" % guard)
+
+
+def dump_c_defines(args):
+    guard = str.replace(os.path.basename(args.xml), '.', '_').upper()
+    dump_c(args, guard, lambda p: p.dump())
+
+
+def dump_c_pack_structs(args):
+    guard = str.replace(os.path.basename(args.xml), '.', '_').upper() + '_STRUCTS'
+    dump_c(args, guard, lambda p: p.dump_structs())
+
+
+def dump_py_defines(args):
+    p = Parser()
+
+    try:
+        p.parse(args.rnn, args.xml)
+    except Error as e:
+        print(e, file=sys.stderr)
+        exit(1)
+
+    file_name = os.path.splitext(os.path.basename(args.xml))[0]
+
+    print("from enum import IntEnum")
+    print("class %sRegs(IntEnum):" % file_name.upper())
+
+    os.path.basename(args.xml)
+
+    p.dump_regs_py()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--rnn', type=str, required=True)
+    parser.add_argument('--xml', type=str, required=True)
+
+    subparsers = parser.add_subparsers(required=True)
+
+    parser_c_defines = subparsers.add_parser('c-defines')
+    parser_c_defines.set_defaults(func=dump_c_defines)
+
+    parser_c_pack_structs = subparsers.add_parser('c-pack-structs')
+    parser_c_pack_structs.set_defaults(func=dump_c_pack_structs)
+
+    parser_py_defines = subparsers.add_parser('py-defines')
+    parser_py_defines.set_defaults(func=dump_py_defines)
+
+    args = parser.parse_args()
+    args.func(args)
+
+
+if __name__ == '__main__':
+    main()
diff 
--git a/src/gallium/drivers/ethosu/gen_parser.py b/src/gallium/drivers/ethosu/gen_parser.py new file mode 100644 index 00000000000..8cf8676c4bf --- /dev/null +++ b/src/gallium/drivers/ethosu/gen_parser.py @@ -0,0 +1,745 @@ +import xml.parsers.expat +import sys +import os +import collections + +class Error(Exception): + def __init__(self, message): + self.message = message + +class Enum(object): + def __init__(self, name): + self.name = name + self.values = [] + + def has_name(self, name): + for (n, value) in self.values: + if n == name: + return True + return False + + def dump(self): + use_hex = False + for (name, value) in self.values: + if value > 0x1000: + use_hex = True + + print("enum %s {" % self.name) + for (name, value) in self.values: + if use_hex: + print("\t%s = 0x%08x," % (name, value)) + else: + print("\t%s = %d," % (name, value)) + print("};\n") + + def dump_pack_struct(self): + pass + +class Field(object): + def __init__(self, name, low, high, shr, type, parser): + self.name = name + self.low = low + self.high = high + self.shr = shr + self.type = type + + builtin_types = [ None, "a3xx_regid", "boolean", "uint", "hex", "int", "fixed", "ufixed", "float", "address", "waddress" ] + + maxpos = parser.current_bitsize - 1 + + if low < 0 or low > maxpos: + raise parser.error("low attribute out of range: %d" % low) + if high < 0 or high > maxpos: + raise parser.error("high attribute out of range: %d" % high) + if high < low: + raise parser.error("low is greater than high: low=%d, high=%d" % (low, high)) + if self.type == "boolean" and not low == high: + raise parser.error("booleans should be 1 bit fields") + elif self.type == "float" and not (high - low == 31 or high - low == 15): + raise parser.error("floats should be 16 or 32 bit fields") + elif not self.type in builtin_types and not self.type in parser.enums: + raise parser.error("unknown type '%s'" % self.type) + + def ctype(self, var_name): + if self.type == None: + type = "uint32_t" + val = var_name + elif self.type == "boolean": + type = "bool" + val = var_name + elif self.type == "uint" or self.type == "hex" or self.type == "a3xx_regid": + type = "uint32_t" + val = var_name + elif self.type == "int": + type = "int32_t" + val = var_name + elif self.type == "fixed": + type = "float" + val = "((int32_t)(%s * %d.0))" % (var_name, 1 << self.radix) + elif self.type == "ufixed": + type = "float" + val = "((uint32_t)(%s * %d.0))" % (var_name, 1 << self.radix) + elif self.type == "float" and self.high - self.low == 31: + type = "float" + val = "fui(%s)" % var_name + elif self.type == "float" and self.high - self.low == 15: + type = "float" + val = "_mesa_float_to_half(%s)" % var_name + elif self.type in [ "address", "waddress" ]: + type = "uint64_t" + val = var_name + else: + type = "enum %s" % self.type + val = var_name + + if self.shr > 0: + val = "(%s >> %d)" % (val, self.shr) + + return (type, val) + +def tab_to(name, value): + tab_count = (68 - (len(name) & ~7)) // 8 + if tab_count <= 0: + tab_count = 1 + print(name + ('\t' * tab_count) + value) + +def mask(low, high): + return ((0xffffffffffffffff >> (64 - (high + 1 - low))) << low) + +def field_name(reg, f): + if f.name: + name = f.name.lower() + else: + # We hit this path when a reg is defined with no bitset fields, ie. 
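+        # a bare <reg32 offset="0x0000" name="ID"/> element (hypothetical
+        # example) that declares no <bitfield> children.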
+ # + name = reg.name.lower() + + if (name in [ "double", "float", "int" ]) or not (name[0].isalpha()): + name = "_" + name + + return name + +class Bitset(object): + def __init__(self, name, template): + self.name = name + self.inline = False + if template: + self.fields = template.fields[:] + else: + self.fields = [] + + # Get address field if there is one in the bitset, else return None: + def get_address_field(self): + for f in self.fields: + if f.type in [ "address", "waddress" ]: + return f + return None + + def dump_regpair_builder(self, reg): + print("#ifndef NDEBUG") + known_mask = 0 + for f in self.fields: + known_mask |= mask(f.low, f.high) + if f.type in [ "boolean", "address", "waddress" ]: + continue + type, val = f.ctype("fields.%s" % field_name(reg, f)) + print(" assert((%-40s & 0x%08x) == 0);" % (val, 0xffffffff ^ mask(0 , f.high - f.low))) + print(" assert((%-40s & 0x%08x) == 0);" % ("fields.unknown", known_mask)) + print("#endif\n") + + print(" return (struct fd_reg_pair) {") + if reg.array: + print(" .reg = REG_%s(__i)," % reg.full_name) + else: + print(" .reg = REG_%s," % reg.full_name) + + print(" .value =") + for f in self.fields: + if f.type in [ "address", "waddress" ]: + continue + else: + type, val = f.ctype("fields.%s" % field_name(reg, f)) + print(" (%-40s << %2d) |" % (val, f.low)) + value_name = "dword" + if reg.bit_size == 64: + value_name = "qword" + print(" fields.unknown | fields.%s," % (value_name,)) + + address = self.get_address_field() + if address: + print(" .bo = fields.bo,") + print(" .is_address = true,") + if f.type == "waddress": + print(" .bo_write = true,") + print(" .bo_offset = fields.bo_offset,") + print(" .bo_shift = %d," % address.shr) + print(" .bo_low = %d," % address.low) + + print(" };") + + def dump_pack_struct(self, reg=None): + if not reg: + return + + prefix = reg.full_name + + print("struct %s {" % prefix) + for f in self.fields: + if f.type in [ "address", "waddress" ]: + tab_to(" __bo_type", "bo;") + tab_to(" uint32_t", "bo_offset;") + continue + name = field_name(reg, f) + + type, val = f.ctype("var") + + tab_to(" %s" % type, "%s;" % name) + if reg.bit_size == 64: + tab_to(" uint64_t", "unknown;") + tab_to(" uint64_t", "qword;") + else: + tab_to(" uint32_t", "unknown;") + tab_to(" uint32_t", "dword;") + print("};\n") + + if reg.array: + print("static inline struct fd_reg_pair\npack_%s(uint32_t __i, struct %s fields)\n{" % + (prefix, prefix)) + else: + print("static inline struct fd_reg_pair\npack_%s(struct %s fields)\n{" % + (prefix, prefix)) + + self.dump_regpair_builder(reg) + + print("\n}\n") + + if self.get_address_field(): + skip = ", { .reg = 0 }" + else: + skip = "" + + if reg.array: + print("#define %s(__i, ...) pack_%s(__i, __struct_cast(%s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + else: + print("#define %s(...) 
pack_%s(__struct_cast(%s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + + + def dump(self, prefix=None): + if prefix == None: + prefix = self.name + for f in self.fields: + if f.name: + name = prefix + "_" + f.name.upper() + else: + name = prefix + + if not f.name and f.low == 0 and f.shr == 0 and not f.type in ["float", "fixed", "ufixed"]: + pass + elif f.type == "boolean" or (f.type == None and f.low == f.high): + tab_to("#define %s" % name, "0x%08x" % (1 << f.low)) + else: + tab_to("#define %s__MASK" % name, "0x%08x" % mask(f.low, f.high)) + tab_to("#define %s__SHIFT" % name, "%d" % f.low) + type, val = f.ctype("val") + + print("static inline uint32_t %s(%s val)\n{" % (name, type)) + if f.shr > 0: + print("\tassert(!(val & 0x%x));" % mask(0, f.shr - 1)) + print("\treturn ((%s) << %s__SHIFT) & %s__MASK;\n}" % (val, name, name)) + print() + +class Array(object): + def __init__(self, attrs, domain, variant): + if "name" in attrs: + self.name = attrs["name"] + else: + self.name = "" + self.domain = domain + self.variant = variant + self.offset = int(attrs["offset"], 0) + self.stride = int(attrs["stride"], 0) + self.length = int(attrs["length"], 0) + if "usage" in attrs: + self.usages = attrs["usage"].split(',') + else: + self.usages = None + + def dump(self): + print("#define _%s(i0) (0x%08x + 0x%x*(i0))\n" % (self.name, self.offset, self.stride)) + + def dump_pack_struct(self): + pass + + def dump_regpair_builder(self): + pass + +class Reg(object): + def __init__(self, attrs, domain, array, bit_size): + self.name = attrs["name"] + self.domain = domain + self.array = array + self.offset = int(attrs["offset"], 0) + self.type = None + self.bit_size = bit_size + if array: + self.name = array.name + "_" + self.name + self.full_name = self.name + + def dump(self): + if self.array: + offset = self.array.offset + self.offset + print("static inline uint32_t %s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (self.full_name, offset, self.array.stride)) + else: + tab_to("#define %s" % self.full_name, "0x%08x" % self.offset) + + if self.bitset.inline: + self.bitset.dump(self.full_name) + + def dump_pack_struct(self): + if self.bitset.inline: + self.bitset.dump_pack_struct(self) + + def dump_regpair_builder(self): + if self.bitset.inline: + self.bitset.dump_regpair_builder(self) + + def dump_py(self): + print("\tREG_%s = 0x%08x" % (self.full_name, self.offset)) + + +class Parser(object): + def __init__(self): + self.current_array = None + self.current_domain = None + self.current_prefix = None + self.current_prefix_type = None + self.current_stripe = None + self.current_bitset = None + self.current_bitsize = 32 + # The varset attribute on the domain specifies the enum which + # specifies all possible hw variants: + self.current_varset = None + # Regs that have multiple variants.. 
we only generated the C++ + # template based struct-packers for these + self.variant_regs = {} + # Information in which contexts regs are used, to be used in + # debug options + self.usage_regs = collections.defaultdict(list) + self.bitsets = {} + self.enums = {} + self.variants = set() + self.file = [] + self.xml_files = [] + self.copyright_year = None + self.authors = [] + self.license = None + + def error(self, message): + parser, filename = self.stack[-1] + return Error("%s:%d:%d: %s" % (filename, parser.CurrentLineNumber, parser.CurrentColumnNumber, message)) + + def prefix(self, variant=None): + if self.current_prefix_type == "variant" and variant: + return variant + elif self.current_stripe: + return self.current_stripe + "_" + self.current_domain + elif self.current_prefix: + return self.current_prefix + "_" + self.current_domain + else: + return self.current_domain + + def parse_field(self, name, attrs): + try: + if "pos" in attrs: + high = low = int(attrs["pos"], 0) + elif "high" in attrs and "low" in attrs: + high = int(attrs["high"], 0) + low = int(attrs["low"], 0) + else: + low = 0 + high = self.current_bitsize - 1 + + if "type" in attrs: + type = attrs["type"] + else: + type = None + + if "shr" in attrs: + shr = int(attrs["shr"], 0) + else: + shr = 0 + + b = Field(name, low, high, shr, type, self) + + if type == "fixed" or type == "ufixed": + b.radix = int(attrs["radix"], 0) + + self.current_bitset.fields.append(b) + except ValueError as e: + raise self.error(e) + + def parse_varset(self, attrs): + # Inherit the varset from the enclosing domain if not overriden: + varset = self.current_varset + if "varset" in attrs: + varset = self.enums[attrs["varset"]] + return varset + + def parse_variants(self, attrs): + if not "variants" in attrs: + return None + variant = attrs["variants"].split(",")[0] + if "-" in variant: + variant = variant[:variant.index("-")] + + varset = self.parse_varset(attrs) + + assert varset.has_name(variant) + + return variant + + def add_all_variants(self, reg, attrs, parent_variant): + # TODO this should really handle *all* variants, including dealing + # with open ended ranges (ie. "A2XX,A4XX-") (we have the varset + # enum now to make that possible) + variant = self.parse_variants(attrs) + if not variant: + variant = parent_variant + + if reg.name not in self.variant_regs: + self.variant_regs[reg.name] = {} + else: + # All variants must be same size: + v = next(iter(self.variant_regs[reg.name])) + assert self.variant_regs[reg.name][v].bit_size == reg.bit_size + + self.variant_regs[reg.name][variant] = reg + + def add_all_usages(self, reg, usages): + if not usages: + return + + for usage in usages: + self.usage_regs[usage].append(reg) + + self.variants.add(reg.domain) + + def do_validate(self, schemafile): + try: + from lxml import etree + + parser, filename = self.stack[-1] + dirname = os.path.dirname(filename) + + # we expect this to look like schema.xsd.. I think + # technically it is supposed to be just a URL, but that doesn't + # quite match up to what we do.. Just skip over everything up to + # and including the first whitespace character: + schemafile = schemafile[schemafile.rindex(" ")+1:] + + # this is a bit cheezy, but the xml file to validate could be + # in a child director, ie. we don't really know where the schema + # file is, the way the rnn C code does. 
So if it doesn't exist + # just look one level up + if not os.path.exists(dirname + "/" + schemafile): + schemafile = "../" + schemafile + + if not os.path.exists(dirname + "/" + schemafile): + raise self.error("Cannot find schema for: " + filename) + + xmlschema_doc = etree.parse(dirname + "/" + schemafile) + xmlschema = etree.XMLSchema(xmlschema_doc) + + xml_doc = etree.parse(filename) + if not xmlschema.validate(xml_doc): + error_str = str(xmlschema.error_log.filter_from_errors()[0]) + raise self.error("Schema validation failed for: " + filename + "\n" + error_str) + except ImportError: + print("lxml not found, skipping validation", file=sys.stderr) + + def do_parse(self, filename): + filepath = os.path.abspath(filename) + if filepath in self.xml_files: + return + self.xml_files.append(filepath) + file = open(filename, "rb") + parser = xml.parsers.expat.ParserCreate() + self.stack.append((parser, filename)) + parser.StartElementHandler = self.start_element + parser.EndElementHandler = self.end_element + parser.CharacterDataHandler = self.character_data + parser.buffer_text = True + parser.ParseFile(file) + self.stack.pop() + file.close() + + def parse(self, rnn_path, filename): + self.path = rnn_path + self.stack = [] + self.do_parse(filename) + + def parse_reg(self, attrs, bit_size): + self.current_bitsize = bit_size + if "type" in attrs and attrs["type"] in self.bitsets: + bitset = self.bitsets[attrs["type"]] + if bitset.inline: + self.current_bitset = Bitset(attrs["name"], bitset) + self.current_bitset.inline = True + else: + self.current_bitset = bitset + else: + self.current_bitset = Bitset(attrs["name"], None) + self.current_bitset.inline = True + if "type" in attrs: + self.parse_field(None, attrs) + + variant = self.parse_variants(attrs) + if not variant and self.current_array: + variant = self.current_array.variant + + self.current_reg = Reg(attrs, self.prefix(variant), self.current_array, bit_size) + self.current_reg.bitset = self.current_bitset + + if len(self.stack) == 1: + self.file.append(self.current_reg) + + if variant is not None: + self.add_all_variants(self.current_reg, attrs, variant) + + usages = None + if "usage" in attrs: + usages = attrs["usage"].split(',') + elif self.current_array: + usages = self.current_array.usages + + self.add_all_usages(self.current_reg, usages) + + def start_element(self, name, attrs): + self.cdata = "" + if name == "import": + filename = attrs["file"] + self.do_parse(os.path.join(self.path, filename)) + elif name == "domain": + self.current_domain = attrs["name"] + if "prefix" in attrs: + self.current_prefix = self.parse_variants(attrs) + self.current_prefix_type = attrs["prefix"] + else: + self.current_prefix = None + self.current_prefix_type = None + if "varset" in attrs: + self.current_varset = self.enums[attrs["varset"]] + elif name == "stripe": + self.current_stripe = self.parse_variants(attrs) + elif name == "enum": + self.current_enum_value = 0 + self.current_enum = Enum(attrs["name"]) + self.enums[attrs["name"]] = self.current_enum + if len(self.stack) == 1: + self.file.append(self.current_enum) + elif name == "value": + if "value" in attrs: + value = int(attrs["value"], 0) + else: + value = self.current_enum_value + self.current_enum.values.append((attrs["name"], value)) + elif name == "reg32": + self.parse_reg(attrs, 32) + elif name == "reg64": + self.parse_reg(attrs, 64) + elif name == "array": + self.current_bitsize = 32 + variant = self.parse_variants(attrs) + self.current_array = Array(attrs, self.prefix(variant), variant) + 
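+            # A (hypothetical) element such as
+            #   <array name="JOB" offset="0x100" stride="0x10" length="4"/>
+            # ends up as "#define _JOB(i0) (0x00000100 + 0x10*(i0))" when
+            # Array.dump() runs.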
+            if len(self.stack) == 1:
+                self.file.append(self.current_array)
+        elif name == "bitset":
+            self.current_bitset = Bitset(attrs["name"], None)
+            if "inline" in attrs and attrs["inline"] == "yes":
+                self.current_bitset.inline = True
+            self.bitsets[self.current_bitset.name] = self.current_bitset
+            if len(self.stack) == 1 and not self.current_bitset.inline:
+                self.file.append(self.current_bitset)
+        elif name == "bitfield" and self.current_bitset:
+            self.parse_field(attrs["name"], attrs)
+        elif name == "database":
+            self.do_validate(attrs["xsi:schemaLocation"])
+        elif name == "copyright":
+            self.copyright_year = attrs["year"]
+        elif name == "author":
+            self.authors.append(attrs["name"] + " <" + attrs["email"] + "> " + attrs["name"])
+
+    def end_element(self, name):
+        if name == "domain":
+            self.current_domain = None
+            self.current_prefix = None
+            self.current_prefix_type = None
+        elif name == "stripe":
+            self.current_stripe = None
+        elif name == "bitset":
+            self.current_bitset = None
+        elif name == "reg32":
+            self.current_reg = None
+        elif name == "array":
+            self.current_array = None
+        elif name == "enum":
+            self.current_enum = None
+        elif name == "license":
+            self.license = self.cdata
+
+    def character_data(self, data):
+        self.cdata += data
+
+    def dump_reg_usages(self):
+        d = collections.defaultdict(list)
+        for usage, regs in self.usage_regs.items():
+            for reg in regs:
+                variants = self.variant_regs.get(reg.name)
+                if variants:
+                    for variant, vreg in variants.items():
+                        if reg == vreg:
+                            d[(usage, variant)].append(reg)
+                else:
+                    for variant in self.variants:
+                        d[(usage, variant)].append(reg)
+
+        print("#ifdef __cplusplus")
+
+        for usage, regs in self.usage_regs.items():
+            print("template<chip CHIP> constexpr inline uint16_t %s_REGS[] = {};" % (usage.upper()))
+
+        for (usage, variant), regs in d.items():
+            offsets = []
+
+            for reg in regs:
+                if reg.array:
+                    for i in range(reg.array.length):
+                        offsets.append(reg.array.offset + reg.offset + i * reg.array.stride)
+                        if reg.bit_size == 64:
+                            offsets.append(offsets[-1] + 1)
+                else:
+                    offsets.append(reg.offset)
+                    if reg.bit_size == 64:
+                        offsets.append(offsets[-1] + 1)
+
+            offsets.sort()
+
+            print("template<> constexpr inline uint16_t %s_REGS<%s>[] = {" % (usage.upper(), variant))
+            for offset in offsets:
+                print("\t%s," % hex(offset))
+            print("};")
+
+        print("#endif")
+
+    def dump(self):
+        enums = []
+        bitsets = []
+        regs = []
+        for e in self.file:
+            if isinstance(e, Enum):
+                enums.append(e)
+            elif isinstance(e, Bitset):
+                bitsets.append(e)
+            else:
+                regs.append(e)
+
+        for e in enums + bitsets + regs:
+            e.dump()
+
+        self.dump_reg_usages()
+
+        print("static inline char *ethosu_get_cmd_name(unsigned domain, uint32_t cmd) {")
+        for e in regs:
+            if e.array:
+                continue
+            domain = 0 if e.domain == "CMD0" else 1
+            print(" if (domain == %d && cmd == 0x%08x) return \"%s\";" % (domain, e.offset, e.full_name))
+        print(" return NULL;")
+        print("}\n")
+
+    def dump_regs_py(self):
+        regs = []
+        for e in self.file:
+            if isinstance(e, Reg):
+                regs.append(e)
+
+        for e in regs:
+            e.dump_py()
+
+
+    def dump_reg_variants(self, regname, variants):
+        # Don't bother for things that only have a single variant:
+        if len(variants) == 1:
+            return
+        print("#ifdef __cplusplus")
+        print("struct __%s {" % regname)
+        # TODO be more clever.. we should probably figure out which
+        # fields have the same type in all variants (in which they
+        # appear) and stuff everything else in a variant specific
+        # sub-structure.
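+        # The struct emitted below is the union of all variants' fields;
+        # the templated __REGNAME<CHIP>() builder then dispatches to the
+        # matching variant's pack code, so fields that do not exist on
+        # the chosen variant are simply left unused.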
+ seen_fields = [] + bit_size = 32 + array = False + address = None + for variant in variants.keys(): + print(" /* %s fields: */" % variant) + reg = variants[variant] + bit_size = reg.bit_size + array = reg.array + for f in reg.bitset.fields: + fld_name = field_name(reg, f) + if fld_name in seen_fields: + continue + seen_fields.append(fld_name) + name = fld_name.lower() + if f.type in [ "address", "waddress" ]: + if address: + continue + address = f + tab_to(" __bo_type", "bo;") + tab_to(" uint32_t", "bo_offset;") + continue + type, val = f.ctype("var") + tab_to(" %s" %type, "%s;" %name) + print(" /* fallback fields: */") + if bit_size == 64: + tab_to(" uint64_t", "unknown;") + tab_to(" uint64_t", "qword;") + else: + tab_to(" uint32_t", "unknown;") + tab_to(" uint32_t", "dword;") + print("};") + # TODO don't hardcode the varset enum name + varenum = "chip" + print("template <%s %s>" % (varenum, varenum.upper())) + print("static inline struct fd_reg_pair") + xtra = "" + xtravar = "" + if array: + xtra = "int __i, " + xtravar = "__i, " + print("__%s(%sstruct __%s fields) {" % (regname, xtra, regname)) + for variant in variants.keys(): + print(" if (%s == %s) {" % (varenum.upper(), variant)) + reg = variants[variant] + reg.dump_regpair_builder() + print(" } else") + print(" assert(!\"invalid variant\");") + print("}") + + if bit_size == 64: + skip = ", { .reg = 0 }" + else: + skip = "" + + print("#define %s(VARIANT, %s...) __%s(%s{__VA_ARGS__})%s" % (regname, xtravar, regname, xtravar, skip)) + print("#endif /* __cplusplus */") + + def dump_structs(self): + for e in self.file: + e.dump_pack_struct() + + for regname in self.variant_regs: + self.dump_reg_variants(regname, self.variant_regs[regname]) diff --git a/src/gallium/drivers/ethosu/meson.build b/src/gallium/drivers/ethosu/meson.build new file mode 100644 index 00000000000..28f696a1bf5 --- /dev/null +++ b/src/gallium/drivers/ethosu/meson.build @@ -0,0 +1,33 @@ +# Copyright 2019 Google, Inc +# SPDX-License-Identifier: MIT + +ethosu_registers = custom_target( + 'ethosu_registers.h', + input : ['gen_parser.py', 'gen_header.py', 'registers.xml'], + output : 'ethosu_registers.h', + command : [prog_python, '@INPUT1@', '--rnn', '.', '--xml', '@INPUT2@', 'c-defines'], + capture : true, +) + +files_ethosu = files( + 'ethosu_cmd.c', + 'ethosu_coefs.c', + 'ethosu_device.c', + 'ethosu_lower.c', + 'ethosu_ml.c', + 'ethosu_sched.c', + 'mlw_codec/mlw_encode.c', +) + +libethosu = static_library( + 'ethosu', + [files_ethosu, ethosu_registers], + include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src], + gnu_symbol_visibility : 'hidden', + dependencies : [idep_mesautil, dep_libdrm], +) + +driver_ethosu = declare_dependency( + compile_args : '-DGALLIUM_ETHOSU', + link_with : [libethosuwinsys, libethosu] +) diff --git a/src/gallium/drivers/ethosu/mlw_codec/mlw_common.h b/src/gallium/drivers/ethosu/mlw_codec/mlw_common.h new file mode 100644 index 00000000000..4bb38387221 --- /dev/null +++ b/src/gallium/drivers/ethosu/mlw_codec/mlw_common.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright 2020, 2022 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef MLW_COMMON_H
+#define MLW_COMMON_H
+
+#define ZDIV_DISABLE 6 // not alternating mode
+#define ZDIV_EOS 7 // indicates end of stream
+
+#define WDIV_UNCOMPRESSED 7 // indicates uncompressed weights
+
+#endif
diff --git a/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c
new file mode 100644
index 00000000000..47dd132090b
--- /dev/null
+++ b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c
@@ -0,0 +1,1186 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2020-2022, 2024 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mlw_common.h"
+#include "mlw_encode.h"
+
+#define DPRINTF(...)
+//#define DPRINTF(...) printf(__VA_ARGS__)
+
+#define ZERO_RUN_THRES 4
+
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#define CHECKED_MALLOC(var, size) { if ( !(var = malloc(size)) ) break; }
+
+typedef struct palette {
+    int16_t lut[32];
+    int16_t inv_lut[512];
+    int palsize;       // number of palette entries
+    int palbits;       // bit width of palette entries
+    int use_zero_runs; // zeros are coded separately
+    int only_palette;  // no values outside the palette
+    int direct_offset; // added to the decoded weight index before direct conversion to sign/mag
+    int only_zeros;    // special case that the section is all zeros
+} palette_t;
+
+static int is_power_of_two( int x ) {
+    return ((x-1) & x)==0;
+}
+
+static int round_up_divide(int num, int den)
+{
+    return (num + den - 1) / den;
+}
+
+static int round_up(int num, int den)
+{
+    return round_up_divide(num, den) * den;
+}
+
+static int get_palette_index_bits( int size ) {
+    int i;
+    for(i=7; i>=0; i--)
+        if (size > (1<<i) )
+            return i+1;
+    return 0;
+}
+
+// Search the input stream for positions where a new palette should be
+// created, so that sections with different value distributions each get
+// their own palette. Returns the number of sections (at least one,
+// starting at position 0), or -1 on allocation failure.
+// *palette_restart_positions is allocated here and owned by the caller.
+static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
+    int i, j, got_palette = 0, zero_cnt = 0;
+    int palette_size = 0, last_restart_idx = 0, restart_i = 1;
+    int prev_idx[512]; // For each value, the position of its previous occurrence
+    int *restart_pos;
+    int max_palettes = round_up_divide(size, 64);
+
+    *palette_restart_positions = NULL;
+    restart_pos = (int*)malloc( max_palettes*sizeof(int) );
+    if (!restart_pos) {
+        return -1;
+    }
+    restart_pos[0] = 0;
+    memset( prev_idx, -1, sizeof(prev_idx) );
+    for(i=0; i<size; i++) {
+        // Decide if the zero value should be excluded from the palette
+        int exclude_zero = zero_cnt > (i-last_restart_idx)/4;
+
+        if (got_palette) {
+            // Check if the next value is not covered by the current palette
+            if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
+                // New value: increase the palette size
+                palette_size++;
+                DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
+                if ( is_power_of_two(palette_size-1-exclude_zero) ) {
+                    if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
+                        // create a new palette because we extend a long lasting palette to require one more index bit
+                        DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit.
last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt ); + if (restart_i == max_palettes) { + max_palettes = max_palettes*2; + restart_pos = (int*)realloc( restart_pos, max_palettes*sizeof(int) ); + if (!restart_pos) { + return -1; + } + } + DPRINTF("restart %d pos %d\n", restart_i, i); + restart_pos[restart_i++] = i; + last_restart_idx = i; + got_palette=0; + zero_cnt=0; + } + } + } + } + + prev_idx[ buf[i]+256 ] = i; + if (buf[i]==0) + zero_cnt++; + + static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}}; + int k; + // loop over window sizes + for(k=0; k<5; k++) { + // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ. + int N = window_sizes[k][0] * (got_palette?2:1); + if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) { + // Search backward to the position N nonzero values earlier + int nzcnt=0; + for( j=i; j>last_restart_idx; j--) { + if ( buf[j]!=0 ) { + if (nzcnt==N+1) + break; + nzcnt++; + } + } + int restart_idx = j; + + // Calculate the size of a new palette (starting at restart_idx) + int new_palette_size=0; + for(j=0; j<512; j++) { + if ( prev_idx[j] >= restart_idx ) { + new_palette_size++; + } + } + + int create_new_palette=0; + if (got_palette) { + int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero ); + int old_size_bits = get_palette_index_bits( palette_size - exclude_zero ); + int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20; + if ( savings>0 ) { + // Create new palette because it can be smaller than the existing palette + create_new_palette=1; + DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx); + } + } else { + if ( (new_palette_size-exclude_zero) <= 32) { + int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero ); + // estimate if we will make savings by using palette mode + int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20; + create_new_palette = savings>0; + } + } + if (create_new_palette) { + palette_size=new_palette_size; + got_palette=1; + last_restart_idx = restart_idx; + DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size); + if ( restart_pos[restart_i-1] != last_restart_idx) { + if (restart_i == max_palettes) { + max_palettes = max_palettes*2; + restart_pos = (int*)realloc( restart_pos, max_palettes*sizeof(int) ); + if (!restart_pos) { + return -1; + } + } + restart_pos[restart_i++] = last_restart_idx; + } + zero_cnt=0; + for( j=last_restart_idx; j<=i; j++) + if (buf[j]==0) + zero_cnt++; + } + } + } + } + // Reallocate to actual size + *palette_restart_positions = (int*)realloc( restart_pos, restart_i*sizeof(int) ); + return *palette_restart_positions ? restart_i : -1; +} + +// Calculate frequency table +static void calc_freq( const int16_t *buf, int size, int freq[512] ) { + int i; + memset(freq, 0, 512*sizeof(int)); + for(i=0; ibb ? -1 : aa0) { + all_max_val = max(all_max_val, palval); + } + } + + // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc) + for(i=0; i<31; i++) { + if ((freq64[i]>>16)!=0) + break; + } + p->direct_offset = i; + + // Sort in descending frequency order + qsort(freq64, 512, sizeof(uint64_t), cmp_uint64); + + // Identify special case that there are no weights to code + // in the weight index stream (i.e. 
all weights are zeros) + p->only_zeros = (freq64[0]>>16)==0; + if (p->only_zeros) { + p->direct_offset=0; + } + + // Check if all weights fit into the palette (and the palette is not empty) + p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0; + + int max_palette_size; + if (p->only_palette) { + max_palette_size = 32; + } else { + // For direct-lut we must make sure that the encoded weight + // index is not > 511. We do that by limiting the palette size + // such that the greatest value can be reached after subtracting + // the palette size. + max_palette_size = min(32, 511-all_max_val); + if (max_palette_size==1) { + max_palette_size=0; // because palette of size 1 is not supported + } + } + + // Setup the 32 entry palette + int16_t palette_max_val = 0, val; + int cnt, pal_cnt=0; + for(i=0; i>16); + val = freq64[i]&0xffff; + if ( cnt==0 ) + break; + p->lut[i] = val; + palette_max_val = max(palette_max_val, val); + pal_cnt+=cnt; + } + if (i==1) + p->lut[i++] = 0; // palette size of 1 is not supported, make it 2 + + // Heuristic for when to use the palette. If more than half of the + // weights are in the palette then we use it. This ensures we don't + // use palette for e.g. rectangular distributions. + int palbits_val; + if (pal_cnt > all_cnt/2) { + p->palsize = i; + palbits_val = palette_max_val; + } else { + // No palette + p->palsize = 0; + // If no palette, then palbits is used to specify the + // number of bits required for uncompressed mode, i.e. + // the number of bits for the greatest weight value + palbits_val = all_max_val; + } + + // the palette entry bit width + // minimum 2bits (because PALBITS is in range 2..9) + int palbits=2; + while( (1<palbits = palbits; + p->use_zero_runs = use_zero_runs; +} + +// Return 1 if zero runs should be used +// If palette_size is 512, then palette is not used (in that case the palette is setup +// with the standard alternating unsigned to signed mapping) +static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) { + int freq[512], i; + + // Calculate frequencies of the given weight stream + calc_freq( inbuf, inbuf_size, freq); + + // Find two most common values + int most_common_freq[2]={0}, most_common_val[2]={0}; + for(i=0; i<512; i++) { + if ( freq[i] > most_common_freq[0] ) { + most_common_freq[1] = most_common_freq[0]; + most_common_val[1] = most_common_val[0]; + most_common_freq[0] = freq[i]; + most_common_val[0] = i-256; + } else if ( freq[i] > most_common_freq[1] ) { + most_common_freq[1] = freq[i]; + most_common_val[1] = i-256; + } + } + + // Decide if zero-runs (alternating mode) should be used: + // * zero should be the most common symbol + // * zero should be sufficiently more common than the second most common symbol + int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1]; + + // Create the palette + create_palette( freq, use_zero_runs, p); + + return use_zero_runs; +} + +static void create_inverse_palette( palette_t *p) { + int i; + memset( p->inv_lut, 0, sizeof(p->inv_lut)); + for(i=0; i<512; i++) { + int val = i; + int sign = val&1; + int mag = val>>1; + int weight = sign ? -mag : mag; + int index = weight+256; + if (index >= 0 && index < 512) + p->inv_lut[ index ] = i + p->palsize - p->direct_offset; + } + for(i=0; ipalsize; i++) { + int val = p->lut[i]; + int sign = val&1; + int mag = val>>1; + int weight = sign ? 
-mag : mag; + int index = weight+256; + assert(index >= 0 && index < 512); + if (index >= 0 && index < 512) + p->inv_lut[ index ] = i; + } +} + +#define NWCFG 13 +#define NZCFG 4 // restrict search to ZDIV=0..3 +#define MAX_ZWCFG (max(NWCFG,NZCFG)) + +// search state +typedef struct search_state { + int bitcnt; // number of bits to reach this state + uint8_t prev_cfg; // previous grc parameter config +} search_state_t; + +// (trunc<<4) | div, 0x20 means uncompressed +static const uint8_t w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 }; +static const uint8_t z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 }; + + + +// An algorithm similar to the Viterbi algorithm is used to search for a +// good GRC parameter sequence for the given input value sequence. +// The inval buffer can contain weights, weight indices or runs. +// The return value is the resulting number of bitstream sections. +static int search_grc_params( const int *inval_buf, + int n_inval, + int zrun_mode, + int uncompressed_bits, + uint8_t *grc_param_cfg, + int *grc_param_pos, + int max_grc_param_cfg, + int *existing_grc_param_pos, + int n_existing_grc_param_pos, + int *bitcnt ) +{ + int n_cfg = zrun_mode ? NZCFG : NWCFG; + const uint8_t *grc_params = zrun_mode ? z_grc_params : w_grc_params; + int i,j; + + search_state_t *state[MAX_ZWCFG]; + for(i=0; i>4; + int q = value>>div; + int bits = trunc ? min(q+1,2) + div : q+1+div; + if (!zrun_mode && ((trunc && q>2) || q>31)) + bits=10000; // it's not possible to code the current value; give it a high cost + if (trunc==2) + bits=uncompressed_bits; + + if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) { + // Change GRC parameters + state[j][i+1].prev_cfg = best_cfg; + state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits; + } else { + // Keep same GRC parameters + state[j][i+1].prev_cfg = j; + state[j][i+1].bitcnt = state[j][i].bitcnt + bits; + } + } + } + + + // Best GRC parameter + int best_bitcnt=0x7fffffff, best_cfg=0; + for(j=0; j=0; i--) { + if (state[cfg][i].prev_cfg != cfg || i==0) { + n_cmds++; + cfg = state[cfg][i].prev_cfg; + } + } + + (void)(max_grc_param_cfg); + assert(n_cmds<=max_grc_param_cfg); + + cfg = best_cfg; + j=n_cmds-1; + int endpos=n_inval; + for(i=n_inval; i>=0; i--) { + if (state[cfg][i].prev_cfg != cfg || i==0) { + grc_param_cfg[j] = cfg; + grc_param_pos[j] = endpos; + j--; + cfg = state[cfg][i].prev_cfg; + endpos = i-1; + } + } + assert(j==-1); + + for(i=0; ibuf = buf; + bb->pos = 0; + bb->buf_size = size; + bb->log_symbols = log_symbols; +} + +static void bitbuf_putbit( bitbuf_t *bb, uint8_t bit) { + int byte_pos = bb->pos>>3; + uint8_t bit_pos = bb->pos&7; + assert( byte_pos >= 0 ); + assert( byte_pos < bb->buf_size ); + bb->buf[ byte_pos ] = ((bb->buf[ byte_pos ] & ~(1U<pos += 1; +} + +static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) { + int i; + if (len>0) { + if (bb->log_symbols) + printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data); + for(i=0; i>i)&1)); + } + } +} + +// Return new bitpos +static int encode_slice( const int *w_value, + const int *z_value, + int nvalues, + palette_t *p, + int new_palette, + int uncompressed_bits, + int w_cfg, + int z_cfg, + uint8_t *bitbuf, + int bitbuf_size, + int bitpos, + int verbose ) +{ + int i,j; + bitbuf_t bitbuf_s, *bb=&bitbuf_s; + bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 ); + bb->pos = bitpos; + + assert(nvalues<32768); + if (w_cfg < 0 || z_cfg < 0) + return bitpos; + // GRC parameters for this slice + 
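+    // Golomb-Rice coding: each value v is split into a quotient
+    // q = v >> div and a 'div'-bit remainder; the quotient costs q+1
+    // unary bits and the remainder is stored verbatim. With div=2,
+    // v=11 is coded as q=2 plus remainder 0b11, i.e. 3+2 bits, which
+    // is the "q+1+div" cost model used in search_grc_params().
+    // WTRUNC caps the unary part at two bits (so q <= 2), and
+    // WDIV_UNCOMPRESSED bypasses GRC entirely for high-entropy data.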
int w_grc_div = w_grc_params[w_cfg] & 15; + int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1; + int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2; + int z_grc_div = z_grc_params[z_cfg] & 15; + + if (w_uncompressed) { + w_grc_div = uncompressed_bits; + } + + int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE; + int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED; + + if (verbose&1) { + printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n", + bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize); + } + + // Write slice header + bitbuf_put( bb, "ZDIV", 3, zdiv); + bitbuf_put( bb, "SLICELEN", 15, nvalues-1 ); + bitbuf_put( bb, "WDIV", 3, wdiv); + bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc ); + bitbuf_put( bb, "NEWPAL", 1, new_palette ); + if (new_palette) { + bitbuf_put( bb, "DIROFS", 5, p->direct_offset ); + bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1)); + bitbuf_put( bb, "PALBITS", 3, p->palbits-2 ); + for(i=0; ipalsize; i++) { + bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] ); + } + } + + int z_nvalues = nvalues + (new_palette?1:0); + int w_pos=0, z_pos=0; + int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0; + int z_unary=0, z_q=-1, z_r=0; + int w_nsymbols=0, w_remain[12]={0}; + int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0}; + int z_nsymbols=0, z_remain[12]={0}; + int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0}; + int z_unary_len = z_grc_div<3 ? 12 : 8; + do { + int balance = p->use_zero_runs ? w_pos - z_pos : 0; + int w_enable = balance<8 && w_pos=0 && p->use_zero_runs && z_pos5 ? 8 : 12; + while(j>w_grc_div; + w_r = value&((1<=0 && j0 ? (1<0) { + w_unary1 |= w_q>1 ? (1<=0) { + w_remain[w_nsymbols] = w_r; + w_nsymbols++; + w_pos++; + } + } + } + + if (z_enable) { + // Encode chunk (zrun) + j=0; + z_nsymbols=0; + z_unary=0; + while(j>z_grc_div; + z_r = value&((1<=0 && j0 ? (1<=0) { + z_remain[z_nsymbols] = z_r; + z_nsymbols++; + z_pos++; + } + } + } + + // Write chunk to bitstream + if (w_enable && !w_uncompressed) { + bitbuf_put( bb, "WUNARY0", 12, w_unary0); + } + if (z_enable) { + bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary); + } + if (w_enable && !w_uncompressed) { + bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1); + } + if (w_prev_enable) { + for(i=0; ipos; +} + +// return new bitpos +static int encode_section( const int16_t *inbuf, + int size, + palette_t *p, + uint8_t *bitbuf, + int bitbuf_size, + int bitpos, + int verbose ) +{ + int uncompressed_bits; + + // Uncompressed mode can only be used if either all weights + // are in the palette OR if the palette is not used. + if (p->only_palette) { + // Uncompressed bits derived from palette size + uncompressed_bits=0; + while( (1<palsize ) + uncompressed_bits++; + } else if (p->palsize==0) { + // Uncompressed bits is palbits (which is the bitdepth of the greatest weight) + uncompressed_bits = p->palbits; + } else { + // Don't use uncompressed + uncompressed_bits = 100; + } + + uint8_t *w_slice_cfg=0; + uint8_t *z_slice_cfg=0; + int *w_slice_pos=0; + int *z_slice_pos=0; + int *weight_values =0; + int *zrun_values = 0; + do { + CHECKED_MALLOC( weight_values, size*sizeof(int) ); + CHECKED_MALLOC( zrun_values, size*sizeof(int) ); + + // Get weights (or weight indicies) AND zero-runs from the input weight stream. 
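+        // With zero runs enabled the two streams alternate: a run length
+        // of zeros precedes every coded weight, so e.g. the section
+        // 0 0 5 7 0 3 yields zero-runs {2,0,1} interleaved with the
+        // palette indices of {5,7,3}; one extra leading run is emitted
+        // when a slice (re)programs the palette (see z_nvalues above).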
+ int i=0, n_weights = 0, zcnt; + while(1) { + if (p->use_zero_runs) { + zcnt=0; + // Count zero run + // Special case: if all weights in the section are zero, we must + // still ensure we have one coded weight so the the slice length + // doesn't become 0. Therefore we skip the first zero run and code + // the zero explicitly as a weight value instead + if (!p->only_zeros || i>0) { + while( iinv_lut[inbuf[i]+256]; + weight_values[n_weights] = value; + n_weights++; + i++; + } + + // Search for good GRC parameters for the weight stream + int n_w_slice, w_bitcnt; + CHECKED_MALLOC( w_slice_cfg, size ); + CHECKED_MALLOC( w_slice_pos, size*sizeof(int) ); + n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt); + if ( n_w_slice < 0 ) { // Memory allocation failed + bitpos = -1; + break; + } + if (n_weights==0) + n_w_slice = 0; + + // Search for good GRC parameters for the zrun stream + int n_z_slice=0, z_bitcnt=0; + if (p->use_zero_runs) { + CHECKED_MALLOC( z_slice_cfg, size ); + CHECKED_MALLOC( z_slice_pos, size*sizeof(int) ); + n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt); + if ( n_z_slice < 0 ) { // Memory allocation failed + bitpos = -1; + break; + } + } + + // Encode bitstream slice + int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1; + while(posuse_zero_runs ? zrun_values+pos+(!new_palette) : 0; + bitpos = encode_slice( weight_values+pos, zrun_buf, len, + p, new_palette, uncompressed_bits, + w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0, + bitbuf, bitbuf_size, bitpos, verbose ); + new_palette = 0; + + if (i_w_sliceuse_zero_runs) { + free(z_slice_cfg); + free(z_slice_pos); + } + free(weight_values); + free(zrun_values); + + return bitpos; +} + +// Encode the given weight stream +// inbuf uncompressed 9bit signed weights +// inbuf_size number of weights +// outbuf compressed bitstream, buffer is malloced within this function +// verbose if non-zero, printf log +// Return value is the size in bytes of the compressed output +// Return -1 if error +int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) { + int i; + // Range check + for(i=0; i255) { + printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]); + return -1; + } + } + + int bitbuf_size = inbuf_size*2+1024; + assert(*outbuf == NULL); + *outbuf = malloc( bitbuf_size ); + if (!*outbuf) + { // Failed to allocate buffer + return -1; + } + + // Analyse input data to find palette re-programming points + int *palette_restart_pos = NULL; + int n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos); + + // Compress each section (using a single palette) separately + int bitpos = 0; + for ( i = 0; i < n_restarts && bitpos >= 0; i++ ) { + palette_t palette; + int pos, size; + pos = palette_restart_pos[i]; + size = (i= 0 && n_restarts >= 0 ) { // If allocation fails bitpos or n_restarts < 0 + // Add end of stream marker and align to 128bit + bitbuf_t bitbuf_s, *bb=&bitbuf_s; + bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 ); + bb->pos = bitpos; + bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS); + bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff ); + + // Pad with 0xff until 64bit aligned + while( bb->pos & 127 ) { + bitbuf_put( bb, "PAD", 8, 0xff ); + } + bitpos = bb->pos; + + assert((bitpos&127)==0); + int outbuf_size = bitpos/8; + *outbuf = realloc(*outbuf, 
outbuf_size); + if ( *outbuf ) { + ret = outbuf_size; + } + } + + free(palette_restart_pos); + + return ret; +} + +void mlw_free_outbuf( uint8_t *outbuf ) { + if (outbuf) + free(outbuf); +} + +struct brick_buf_s +{ + int16_t* buf; + int* strides; +}; +typedef struct brick_buf_s brick_buf_t; + +static int16_t get_brick_weight(brick_buf_t* buf, int ofm_z, int wy, int wx, int ifm_z) +{ + int16_t* p = buf->buf; + + p += ofm_z * buf->strides[0]; + p += wy * buf->strides[1]; + p += wx * buf->strides[2]; + p += ifm_z * buf->strides[3]; + + return *p; +} + +static void reorder_free(int16_t* buf) +{ + if (buf) + { + free(buf); + } +} + +static int16_t* reorder( + int ifm_ublock_depth, + int ofm_ublock_depth, + int ofm_depth, + int kernel_height, + int kernel_width, + int ifm_depth, + int* strides, + int16_t* inbuf, + int ofm_block_depth, + int is_depthwise, + int is_partkernel, + int ifm_bitdepth, + int decomp_h, + int decomp_w, + int64_t* padded_length) +{ + *padded_length = -1; + /* Size unknown. Start with one page at least */ + int64_t length = round_up(max(1, sizeof(int16_t)* + ofm_depth* + kernel_height* + kernel_width* + ifm_depth), + 4*1024) / sizeof(int16_t); + int16_t* weights = (int16_t*)malloc(length * sizeof(int16_t)); + if (!weights) + { // Alloc failed, so exit + return NULL; + } + + brick_buf_t brick_buf; + brick_buf.buf = inbuf; + brick_buf.strides = strides; + + int ifm_block_depth = is_partkernel || ifm_bitdepth == 16 ? 16 : 32; + int64_t weight_cnt = 0; + for (int ofm_block_z = 0; ofm_block_z < ofm_depth; ofm_block_z += ofm_block_depth) + { + int clipped_ofm_block_depth = min(ofm_block_depth, ofm_depth - ofm_block_z); + // IFM blocks required for the brick + for (int ifm_block_z = 0; ifm_block_z < (is_depthwise ? 1 : ifm_depth); ifm_block_z += ifm_block_depth) + { + int clipped_ifm_block_depth; + if (is_depthwise) + { + clipped_ifm_block_depth = ifm_ublock_depth; + } + else + { + clipped_ifm_block_depth = is_partkernel ? + min(ifm_block_depth, ifm_depth - ifm_block_z) : ifm_block_depth; + } + // Weight decomposition + // Subkernel Splitting (H) + for (int subkernel_y = 0; subkernel_y < kernel_height; subkernel_y += decomp_h) + { + int sub_height = min(kernel_height - subkernel_y, decomp_h); + // Subkernel splitting (W) + for (int subkernel_x = 0; subkernel_x < kernel_width; subkernel_x += decomp_w) + { + int sub_width = min(kernel_width - subkernel_x, decomp_w); + int subkernel_elements = sub_width * sub_height; + // Part kernel first works across the kernel H/W and needs padding + if (is_partkernel) + { + if (ifm_bitdepth == 16 && subkernel_elements % 2 != 0) + { + subkernel_elements = round_up(subkernel_elements, 2); + } + else if (ifm_bitdepth == 8 && subkernel_elements % 4 != 0) + { + subkernel_elements = round_up(subkernel_elements, 4); + } + } + else if (is_depthwise) + { + subkernel_elements = round_up(subkernel_elements, 4); + } + int ifm_block_depth_outer = is_partkernel ? clipped_ifm_block_depth : 1; + int ifm_block_depth_inner = is_partkernel ? 
1 : clipped_ifm_block_depth; + for (int ifm_ublk_outer = 0; ifm_ublk_outer < ifm_block_depth_outer; ifm_ublk_outer += ifm_ublock_depth) + { + // OFM Ublocks in OFM-block over depth + for (int ofm_ublk = 0; ofm_ublk < clipped_ofm_block_depth; ofm_ublk += ofm_ublock_depth) + { + // HW Kernel element traversal - cannot be a H/W loop due to element + // padding requirement on depthwise/part-kernel configurations + for (int element = 0; element < subkernel_elements; element++) + { + int kx = element % sub_width; + int ky = element / sub_width; + // IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise) + // In case of part-kernel-first IFM Ublock traversal have already been handled + // and this loop is ignored. + for (int ifm_ublk_inner = 0; ifm_ublk_inner < ifm_block_depth_inner; ifm_ublk_inner += ifm_ublock_depth) + { + // Feed OFM ublock elements + for (int ofm_ublock_z = 0; ofm_ublock_z < ofm_ublock_depth; ofm_ublock_z++) + { + // Source IFM ublock elements (only 1 element deep if depthwise) + for (int ifm_ublock_z = 0; ifm_ublock_z < (is_depthwise ? 1 : ifm_ublock_depth); ifm_ublock_z++) + { + // Source position within the current subkernel + int wx = subkernel_x + kx; + int wy = subkernel_y + ky; + // Source IFM/OFM slices + int ifm_ublk = ifm_ublk_inner + ifm_ublk_outer; + int ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z; + int ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z; + if ((ifm_z < ifm_depth) && (ofm_z < ofm_depth) && (ky < sub_height)) + { + weights[weight_cnt] = get_brick_weight(&brick_buf, ofm_z, wy, wx, ifm_z); + //fprintf(stderr, "weights[%ld] %d ofm_z %d wy %d wx %d ifm_z %d\n", weight_cnt, weights[weight_cnt], ofm_z, wy, wx, ifm_z); + } + else + { + weights[weight_cnt] = 0; + } + weight_cnt++; + if (weight_cnt == length) + { + // Reallocate by doubling the buffer size as needed + length *= 2; + weights = (int16_t*)realloc(weights, length * sizeof(int16_t)); + if (!weights) + { // Realloc failed, so exit + return NULL; + } + } + } + } + } + } + } + } + } + } + } + } + + + weights = (int16_t*)realloc(weights, weight_cnt * sizeof(int16_t)); + if ( weights ) { + *padded_length = weight_cnt; + } + + return weights; +} + +// Reorder and encode the given weight stream +// Return value is the size in bytes of the compressed output +// Return -1 if error +int mlw_reorder_encode( + int ifm_ublock_depth, + int ofm_ublock_depth, + int ofm_depth, + int kernel_height, + int kernel_width, + int ifm_depth, + int* brick_strides, + int16_t* inbuf, + int ofm_block_depth, + int is_depthwise, + int is_partkernel, + int ifm_bitdepth, + int decomp_h, + int decomp_w, + uint8_t **outbuf, // *outbuf must be freed by caller + int64_t* padded_length, + int verbose) +{ + if (verbose) { + fprintf(stderr, "mlw_reorder_encode: %d %d %d %d %d %d (%d %d %d %d) %d %d %d %d %d %d\n", ifm_ublock_depth, + ofm_ublock_depth, + ofm_depth, + kernel_height, + kernel_width, + ifm_depth, + brick_strides[0], + brick_strides[1], + brick_strides[2], + brick_strides[3], + ofm_block_depth, + is_depthwise, + is_partkernel, + ifm_bitdepth, + decomp_h, + decomp_w); + } + /* Reorder weights */ + int16_t* weights = reorder( + ifm_ublock_depth, + ofm_ublock_depth, + ofm_depth, + kernel_height, + kernel_width, + ifm_depth, + brick_strides, + inbuf, + ofm_block_depth, + is_depthwise, + is_partkernel, + ifm_bitdepth, + decomp_h, + decomp_w, + padded_length); + + /* Then encode */ + int output_length = -1; + if (*padded_length > 0 && *padded_length <= INT32_MAX) + { + output_length = mlw_encode(weights, 
(int)*padded_length, outbuf, verbose);
+    }
+    reorder_free(weights);
+
+    return output_length;
+}
diff --git a/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h
new file mode 100644
index 00000000000..3162031e69d
--- /dev/null
+++ b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h
@@ -0,0 +1,65 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef MLW_ENCODE_H
+#define MLW_ENCODE_H
+
+#ifdef _MSC_VER
+    #define MLW_ENCODE_EXPORTED __declspec(dllexport)
+#else
+    #define MLW_ENCODE_EXPORTED __attribute__((visibility("default")))
+#endif
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+MLW_ENCODE_EXPORTED
+int mlw_encode(int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose);
+
+MLW_ENCODE_EXPORTED
+void mlw_free_outbuf(uint8_t *outbuf);
+
+MLW_ENCODE_EXPORTED
+int mlw_reorder_encode(
+    int ifm_ublock_depth,
+    int ofm_ublock_depth,
+    int ofm_depth,
+    int kernel_height,
+    int kernel_width,
+    int ifm_depth,
+    int* brick_strides,
+    int16_t* inbuf,
+    int ofm_block_depth,
+    int is_depthwise,
+    int is_partkernel,
+    int ifm_bitdepth,
+    int decomp_h,
+    int decomp_w,
+    uint8_t **outbuf, // *outbuf must be freed by caller
+    int64_t* padded_length,
+    int verbose);
+
+#if __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/ethosu/registers.xml b/src/gallium/drivers/ethosu/registers.xml
new file mode 100644
index 00000000000..961accc44ed
--- /dev/null
+++ b/src/gallium/drivers/ethosu/registers.xml
@@ -0,0 +1,399 @@
[XML prolog, <database> root element and <copyright>/<author> markup lost in extraction; only the element text survived:]
+Initial Author.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
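+<!-- Sketch of the rules-ng syntax this file uses (hypothetical names,
+     offsets and fields, for illustration only; the real element markup
+     below was lost in extraction):
+       <reg32 offset="0x0" name="NPU_OP" type="cmd0_payload"/>
+       <bitset name="ifm_precision" inline="yes">
+         <bitfield name="FORMAT" low="0" high="1"/>
+       </bitset>
+-->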
[registers.xml body: the enum, bitset and reg32 definitions for the Ethos-U command stream; all element markup was lost in extraction and is not reproduced here]
diff --git a/src/gallium/drivers/ethosu/rules-ng.xsd b/src/gallium/drivers/ethosu/rules-ng.xsd
new file mode 100644
index 00000000000..414dee1d746
--- /dev/null
+++ b/src/gallium/drivers/ethosu/rules-ng.xsd
@@ -0,0 +1,457 @@
[rules-ng.xsd body: the XML Schema for the rules-ng register-database format; the schema markup was lost in extraction. Surviving annotation text: "An updated version of the old rules.xml file from the RivaTV project. Specifications by Pekka Paalanen, preliminary attempt by KoalaBR, first working version by Jakob Bornecrantz. For specifications, see the file rules-ng-format.txt in Nouveau CVS module 'rules-ng'. Version 0.1". The schema documents the author element ("register database author") and the types nickType, databaseType, importType, copyrightType, domainType, groupType, arrayType, stripeType, registerType (used by reg8, reg16, reg32, reg64), bitsetType, bitfieldType, enumType, valueType, refType, the doc elements ("brief documentation, no markup", "root element of documentation sub-tree", "for bold, underline, italics", "definition of a list, ordered or unordered", "items of a list"), and the simple types HexOrNumber, Access and DomainWidth.]
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index fbbceb2d45f..9b02fd1189a 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -190,6 +190,12 @@ if with_gallium_rocket
 else
   driver_rocket = declare_dependency()
 endif
+if with_gallium_ethosu
+  subdir('winsys/ethosu/drm')
+  subdir('drivers/ethosu')
+else
+  driver_ethosu = declare_dependency()
+endif
 if with_gallium_zink
   subdir('drivers/zink')
 else
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index 97f641c4e98..134d7e4adf8 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -59,7 +59,7 @@ libgallium_dri = shared_library(
       driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
       driver_tegra, driver_i915, driver_svga, driver_virgl, driver_panfrost,
       driver_iris, driver_lima, driver_zink, driver_d3d12,
-      driver_asahi, driver_crocus, driver_rocket
+      driver_asahi, driver_crocus, driver_rocket, driver_ethosu
     ],
install : true, name_suffix : libname_suffix, diff --git a/src/gallium/winsys/ethosu/drm/ethosu_drm_public.h b/src/gallium/winsys/ethosu/drm/ethosu_drm_public.h new file mode 100644 index 00000000000..8d45a0c2322 --- /dev/null +++ b/src/gallium/winsys/ethosu/drm/ethosu_drm_public.h @@ -0,0 +1,17 @@ +/* + * Copyright 2014 Broadcom + * Copyright 2018 Alyssa Rosenzweig + * Copyright 2025 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef __ETHOSU_DRM_PUBLIC_H__ +#define __ETHOSU_DRM_PUBLIC_H__ + +struct pipe_screen; +struct pipe_screen_config; + +struct pipe_screen * +ethosu_drm_screen_create(int drmFD, const struct pipe_screen_config *config); + +#endif /* __ETHOSU_DRM_PUBLIC_H__ */ diff --git a/src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c b/src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c new file mode 100644 index 00000000000..33e1e870f6e --- /dev/null +++ b/src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c @@ -0,0 +1,19 @@ +/* + * Copyright 2014 Broadcom + * Copyright 2018 Alyssa Rosenzweig + * Copyright 2025 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "util/os_file.h" +#include "util/u_screen.h" + +#include "ethosu/ethosu_device.h" +#include "ethosu_drm_public.h" + +struct pipe_screen * +ethosu_drm_screen_create(int fd, const struct pipe_screen_config *config) +{ + return u_pipe_screen_lookup_or_create(os_dupfd_cloexec(fd), config, NULL, + ethosu_screen_create); +} diff --git a/src/gallium/winsys/ethosu/drm/meson.build b/src/gallium/winsys/ethosu/drm/meson.build new file mode 100644 index 00000000000..f9fa8ea5d73 --- /dev/null +++ b/src/gallium/winsys/ethosu/drm/meson.build @@ -0,0 +1,13 @@ +# Copyright 2017 Broadcom +# SPDX-License-Identifier: MIT + +libethosuwinsys = static_library( + 'ethosuwinsys', + files('ethosu_drm_winsys.c'), + include_directories : [ + inc_src, inc_include, + inc_gallium, inc_gallium_aux, inc_gallium_drivers, + ], + gnu_symbol_visibility : 'hidden', + dependencies: [dep_libdrm, idep_mesautil], +)
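To give reviewers a feel for the entry point: a minimal sketch of how a frontend
would reach this winsys. The /dev/accel device path, the NULL screen config and
the error handling are illustrative assumptions, not part of this patch; the
winsys itself only dups the fd and caches one pipe_screen per device via
u_pipe_screen_lookup_or_create().

   #include <fcntl.h>
   #include <unistd.h>

   #include "ethosu_drm_public.h"

   static struct pipe_screen *
   open_ethosu_screen(void)
   {
      /* Accel-class devices typically enumerate under /dev/accel/ (assumed). */
      int fd = open("/dev/accel/accel0", O_RDWR | O_CLOEXEC);
      if (fd < 0)
         return NULL;

      /* The winsys dups the fd (os_dupfd_cloexec), so the caller keeps
       * ownership of its own descriptor and may close it afterwards. */
      struct pipe_screen *screen = ethosu_drm_screen_create(fd, NULL);
      close(fd);
      return screen;
   }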