rocket: Initial commit of a driver for Rockchip's NPU

The programming model matches very closely to that of NVIDIA's NVDLA. Enough is implemented to run SSDLite MobileDet with roughly the same performance as the blob (when running on a single NPU core). Reviewed-by: Dave Airlie <airlied@redhat.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29698>
2026-02-03 15:10:37 +01:00 · 2025-02-23 14:26:01 +01:00 · 2025-02-23 14:26:01 +01:00 · 5b829658f7
commit 5b829658f7
parent 41eee4c3cc
33 changed files with 6064 additions and 7 deletions
--- a/.clang-format-include
+++ b/.clang-format-include
@ -3,6 +3,7 @@

 src/gallium/drivers/i915
 src/gallium/drivers/r300/compiler/*
+src/gallium/drivers/rocket/**/*
 src/gallium/targets/teflon/**/*
 src/gallium/frontends/teflon/**/*
 src/amd/vulkan/**/*
--- a/docs/teflon.rst
+++ b/docs/teflon.rst
@ -15,6 +15,9 @@ Mesa contains a TensorFlow Lite delegate that can make use of NPUs to accelerate
   * - Etnaviv
     - ``VeriSilicon VIPNano-SI+.8002``
     - ``NXP iMX8M Plus on Toradex Verdin SoM``
+   * - Rocket
+     - ``RK3588 NPU``
+     - ``PINE64 QuartzPro64``

 .. list-table:: Tested models
   :header-rows: 1
@ -25,29 +28,33 @@ Mesa contains a TensorFlow Lite delegate that can make use of NPUs to accelerate
     - Status
     - Inference speed on AML-A311D-CC Alta
     - Inference speed on Verdin iMX8M Plus
+     - Inference speed on QuartzPro64
   * - MobileNet V1
     - UINT8
     - http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz
     - Fully supported
     - ~6.6 ms
     - ~7.9 ms
+     - ~18 ms
   * - MobileNet V2
     - UINT8
     - https://storage.googleapis.com/mobilenet_v2/checkpoints/quantized_v2_224_100.tgz
     - Fully supported
     - ~6.9 ms
     - ~8.0 ms
+     - ~21 ms
   * - SSDLite MobileDet
     - UINT8
     - https://raw.githubusercontent.com/google-coral/test_data/master/ssdlite_mobiledet_coco_qat_postprocess.tflite
     - Fully supported
     - ~24.8 ms
     - ~24.4 ms
+     - ~48 ms

 Build
 -----

-Build Mesa as usual, with the -Dteflon=true argument.
+Build Mesa as usual, with the -Dteflon=true argument. Make sure at least one of etnaviv or rocket gallium drivers is enabled, as Teflon only works with these drivers.

 Example instructions:

@ -62,7 +69,7 @@ Example instructions:

   # Build Mesa
   ~ $ cd mesa
-   mesa $ meson setup build -Dgallium-drivers=etnaviv -Dvulkan-drivers= -Dteflon=true
+   mesa $ meson setup build -Dgallium-drivers=etnaviv,rocket -Dvulkan-drivers= -Dteflon=true
   mesa $ meson compile -C build

 Install runtime dependencies
@ -99,7 +106,7 @@ This example script has been based from the code in https://github.com/tensorflo
   ~ $ cd mesa/
   mesa $ TEFLON_DEBUG=verbose ETNA_MESA_DEBUG=ml_dbgs python3.10 src/gallium/frontends/teflon/tests/classification.py \
          -i ~/tensorflow/assets/grace_hopper.bmp \
-          -m src/gallium/targets/teflon/tests/mobilenet_v1_1.0_224_quant.tflite \
+          -m src/gallium/targets/teflon/tests/models/mobilenetv1/mobilenet_v1_1_224_quant.tflite \
          -l src/gallium/frontends/teflon/tests/labels_mobilenet_quant_v1_224.txt \
          -e build/src/gallium/targets/teflon/libteflon.so

--- a/include/drm-uapi/rknpu_ioctl.h
+++ b/include/drm-uapi/rknpu_ioctl.h
@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) Fuzhou Rockchip Electronics Co.Ltd
+ * Author: Felix Zeng <felix.zeng@rock-chips.com>
+ */
+
+#ifndef __LINUX_RKNPU_IOCTL_H
+#define __LINUX_RKNPU_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#if !defined(__KERNEL__)
+#define __user
+#endif
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+#define RKNPU_OFFSET_VERSION 0x0
+#define RKNPU_OFFSET_PC_OP_EN 0x8
+#define RKNPU_OFFSET_PC_DATA_ADDR 0x10
+#define RKNPU_OFFSET_PC_DATA_AMOUNT 0x14
+#define RKNPU_OFFSET_PC_TASK_CONTROL 0x30
+#define RKNPU_OFFSET_PC_DMA_BASE_ADDR 0x34
+#define RKNPU_OFFSET_PC_TASK_STATUS 0x3c
+
+#define RKNPU_OFFSET_INT_MASK 0x20
+#define RKNPU_OFFSET_INT_CLEAR 0x24
+#define RKNPU_OFFSET_INT_STATUS 0x28
+#define RKNPU_OFFSET_INT_RAW_STATUS 0x2c
+
+#define RKNPU_OFFSET_CLR_ALL_RW_AMOUNT 0x8010
+#define RKNPU_OFFSET_DT_WR_AMOUNT 0x8034
+#define RKNPU_OFFSET_DT_RD_AMOUNT 0x8038
+#define RKNPU_OFFSET_WT_RD_AMOUNT 0x803c
+
+#define RKNPU_OFFSET_ENABLE_MASK 0xf008
+
+#define RKNPU_INT_CLEAR 0x1ffff
+
+#define RKNPU_PC_DATA_EXTRA_AMOUNT 4
+
+#define RKNPU_STR_HELPER(x) #x
+
+#define RKNPU_GET_DRV_VERSION_STRING(MAJOR, MINOR, PATCHLEVEL)                 \
+	RKNPU_STR_HELPER(MAJOR)                                                \
+	"." RKNPU_STR_HELPER(MINOR) "." RKNPU_STR_HELPER(PATCHLEVEL)
+/*
+ * Driver version code helpers. A version code is packed as
+ * MAJOR * 10000 + MINOR * 100 + PATCHLEVEL. Every macro argument is
+ * parenthesized so that expression arguments (e.g. "a + b") expand with the
+ * intended precedence.
+ */
+#define RKNPU_GET_DRV_VERSION_CODE(MAJOR, MINOR, PATCHLEVEL)                   \
+	((MAJOR) * 10000 + (MINOR) * 100 + (PATCHLEVEL))
+#define RKNPU_GET_DRV_VERSION_MAJOR(CODE) ((CODE) / 10000)
+#define RKNPU_GET_DRV_VERSION_MINOR(CODE) (((CODE) % 10000) / 100)
+#define RKNPU_GET_DRV_VERSION_PATCHLEVEL(CODE) ((CODE) % 100)
+
+/* memory type definitions. */
+enum e_rknpu_mem_type {
+	/* physically continuous memory and used as default. */
+	RKNPU_MEM_CONTIGUOUS = 0 << 0,
+	/* physically non-continuous memory. */
+	RKNPU_MEM_NON_CONTIGUOUS = 1 << 0,
+	/* non-cacheable mapping and used as default. */
+	RKNPU_MEM_NON_CACHEABLE = 0 << 1,
+	/* cacheable mapping. */
+	RKNPU_MEM_CACHEABLE = 1 << 1,
+	/* write-combine mapping. */
+	RKNPU_MEM_WRITE_COMBINE = 1 << 2,
+	/* dma attr kernel mapping */
+	RKNPU_MEM_KERNEL_MAPPING = 1 << 3,
+	/* iommu mapping */
+	RKNPU_MEM_IOMMU = 1 << 4,
+	/* zero mapping */
+	RKNPU_MEM_ZEROING = 1 << 5,
+	/* allocate secure buffer */
+	RKNPU_MEM_SECURE = 1 << 6,
+	/* allocate from non-dma32 zone */
+	RKNPU_MEM_NON_DMA32 = 1 << 7,
+	RKNPU_MEM_MASK = RKNPU_MEM_NON_CONTIGUOUS | RKNPU_MEM_CACHEABLE |
+			 RKNPU_MEM_WRITE_COMBINE | RKNPU_MEM_KERNEL_MAPPING |
+			 RKNPU_MEM_IOMMU | RKNPU_MEM_ZEROING |
+			 RKNPU_MEM_SECURE | RKNPU_MEM_NON_DMA32
+};
+
+/* sync mode definitions. */
+enum e_rknpu_mem_sync_mode {
+	RKNPU_MEM_SYNC_TO_DEVICE = 1 << 0,
+	RKNPU_MEM_SYNC_FROM_DEVICE = 1 << 1,
+	RKNPU_MEM_SYNC_MASK =
+		RKNPU_MEM_SYNC_TO_DEVICE | RKNPU_MEM_SYNC_FROM_DEVICE
+};
+
+/* job mode definitions. */
+enum e_rknpu_job_mode {
+	RKNPU_JOB_SLAVE = 0 << 0,
+	RKNPU_JOB_PC = 1 << 0,
+	RKNPU_JOB_BLOCK = 0 << 1,
+	RKNPU_JOB_NONBLOCK = 1 << 1,
+	RKNPU_JOB_PINGPONG = 1 << 2,
+	RKNPU_JOB_FENCE_IN = 1 << 3,
+	RKNPU_JOB_FENCE_OUT = 1 << 4,
+	RKNPU_JOB_MASK = RKNPU_JOB_PC | RKNPU_JOB_NONBLOCK |
+			 RKNPU_JOB_PINGPONG | RKNPU_JOB_FENCE_IN |
+			 RKNPU_JOB_FENCE_OUT
+};
+
+/* action definitions */
+enum e_rknpu_action {
+	RKNPU_GET_HW_VERSION = 0,
+	RKNPU_GET_DRV_VERSION = 1,
+	RKNPU_GET_FREQ = 2,
+	RKNPU_SET_FREQ = 3,
+	RKNPU_GET_VOLT = 4,
+	RKNPU_SET_VOLT = 5,
+	RKNPU_ACT_RESET = 6,
+	RKNPU_GET_BW_PRIORITY = 7,
+	RKNPU_SET_BW_PRIORITY = 8,
+	RKNPU_GET_BW_EXPECT = 9,
+	RKNPU_SET_BW_EXPECT = 10,
+	RKNPU_GET_BW_TW = 11,
+	RKNPU_SET_BW_TW = 12,
+	RKNPU_ACT_CLR_TOTAL_RW_AMOUNT = 13,
+	RKNPU_GET_DT_WR_AMOUNT = 14,
+	RKNPU_GET_DT_RD_AMOUNT = 15,
+	RKNPU_GET_WT_RD_AMOUNT = 16,
+	RKNPU_GET_TOTAL_RW_AMOUNT = 17,
+	RKNPU_GET_IOMMU_EN = 18,
+	RKNPU_SET_PROC_NICE = 19,
+	RKNPU_POWER_ON = 20,
+	RKNPU_POWER_OFF = 21,
+};
+
+/**
+ * User-desired buffer creation information structure.
+ *
+ * @handle: The handle of the created GEM object.
+ * @flags: user request for setting memory type or cache attributes.
+ * @size: user-desired memory allocation size.
+ *	- this size value would be page-aligned internally.
+ * @obj_addr: address of RKNPU memory object.
+ * @dma_addr: DMA address used by the rknpu to access the buffer.
+ */
+struct rknpu_mem_create {
+	__u32 handle;
+	__u32 flags;
+	__u64 size;
+	__u64 obj_addr;
+	__u64 dma_addr;
+};
+
+/**
+ * A structure for getting a fake-offset that can be used with mmap.
+ *
+ * @handle: handle of gem object.
+ * @reserved: just padding to be 64-bit aligned.
+ * @offset: a fake-offset of gem object.
+ */
+struct rknpu_mem_map {
+	__u32 handle;
+	__u32 reserved;
+	__u64 offset;
+};
+
+/**
+ * For destroying DMA buffer
+ *
+ * @handle:	handle of the buffer.
+ * @reserved: reserved for padding.
+ * @obj_addr: rknpu_mem_object addr.
+ */
+struct rknpu_mem_destroy {
+	__u32 handle;
+	__u32 reserved;
+	__u64 obj_addr;
+};
+
+/**
+ * For synchronizing DMA buffer
+ *
+ * @flags: sync mode flags (see enum e_rknpu_mem_sync_mode).
+ * @reserved: reserved for padding.
+ * @obj_addr: address of RKNPU memory object.
+ * @offset: offset in bytes from start address of buffer.
+ * @size: size of memory region.
+ *
+ */
+struct rknpu_mem_sync {
+	__u32 flags;
+	__u32 reserved;
+	__u64 obj_addr;
+	__u64 offset;
+	__u64 size;
+};
+
+/**
+ * struct rknpu_task structure for task information
+ *
+ * @flags: flags for task
+ * @op_idx: operator index
+ * @enable_mask: enable mask
+ * @int_mask: interrupt mask
+ * @int_clear: interrupt clear
+ * @int_status: interrupt status
+ * @regcfg_amount: register config number
+ * @regcfg_offset: offset for register config
+ * @regcmd_addr: address for register command
+ *
+ */
+struct rknpu_task {
+	__u32 flags;
+	__u32 op_idx;
+	__u32 enable_mask;
+	__u32 int_mask;
+	__u32 int_clear;
+	__u32 int_status;
+	__u32 regcfg_amount;
+	__u32 regcfg_offset;
+	__u64 regcmd_addr;
+} __packed;
+
+/**
+ * struct rknpu_subcore_task structure for subcore task index
+ *
+ * @task_start: task start index
+ * @task_number: task number
+ *
+ */
+struct rknpu_subcore_task {
+	__u32 task_start;
+	__u32 task_number;
+};
+
+/**
+ * struct rknpu_submit structure for job submit
+ *
+ * @flags: flags for job submit
+ * @timeout: submit timeout
+ * @task_start: task start index
+ * @task_number: task number
+ * @task_counter: task counter
+ * @priority: submit priority
+ * @task_obj_addr: address of task object
+ * @regcfg_obj_addr: address of register config object
+ * @task_base_addr: task base address
+ * @user_data: (optional) user data
+ * @core_mask: core mask of rknpu
+ * @fence_fd: dma fence fd
+ * @subcore_task: subcore task
+ *
+ */
+struct rknpu_submit {
+	__u32 flags;
+	__u32 timeout;
+	__u32 task_start;
+	__u32 task_number;
+	__u32 task_counter;
+	__s32 priority;
+	__u64 task_obj_addr;
+	__u64 regcfg_obj_addr;
+	__u64 task_base_addr;
+	__u64 user_data;
+	__u32 core_mask;
+	__s32 fence_fd;
+	struct rknpu_subcore_task subcore_task[5];
+};
+
+/**
+ * struct rknpu_action structure for action (GET, SET or ACT)
+ *
+ * @flags: flags for action
+ * @value: GET or SET value
+ *
+ */
+struct rknpu_action {
+	__u32 flags;
+	__u32 value;
+};
+
+#define RKNPU_ACTION 0x00
+#define RKNPU_SUBMIT 0x01
+#define RKNPU_MEM_CREATE 0x02
+#define RKNPU_MEM_MAP 0x03
+#define RKNPU_MEM_DESTROY 0x04
+#define RKNPU_MEM_SYNC 0x05
+
+#define RKNPU_IOC_MAGIC 'r'
+#define RKNPU_IOW(nr, type) _IOW(RKNPU_IOC_MAGIC, nr, type)
+#define RKNPU_IOR(nr, type) _IOR(RKNPU_IOC_MAGIC, nr, type)
+#define RKNPU_IOWR(nr, type) _IOWR(RKNPU_IOC_MAGIC, nr, type)
+
+#include <drm.h>
+
+#define DRM_IOCTL_RKNPU_ACTION                                                 \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_ACTION, struct rknpu_action)
+#define DRM_IOCTL_RKNPU_SUBMIT                                                 \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_SUBMIT, struct rknpu_submit)
+#define DRM_IOCTL_RKNPU_MEM_CREATE                                             \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_CREATE, struct rknpu_mem_create)
+#define DRM_IOCTL_RKNPU_MEM_MAP                                                \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_MAP, struct rknpu_mem_map)
+#define DRM_IOCTL_RKNPU_MEM_DESTROY                                            \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_DESTROY, struct rknpu_mem_destroy)
+#define DRM_IOCTL_RKNPU_MEM_SYNC                                               \
+	DRM_IOWR(DRM_COMMAND_BASE + RKNPU_MEM_SYNC, struct rknpu_mem_sync)
+
+#define IOCTL_RKNPU_ACTION RKNPU_IOWR(RKNPU_ACTION, struct rknpu_action)
+#define IOCTL_RKNPU_SUBMIT RKNPU_IOWR(RKNPU_SUBMIT, struct rknpu_submit)
+#define IOCTL_RKNPU_MEM_CREATE                                                 \
+	RKNPU_IOWR(RKNPU_MEM_CREATE, struct rknpu_mem_create)
+#define IOCTL_RKNPU_MEM_MAP RKNPU_IOWR(RKNPU_MEM_MAP, struct rknpu_mem_map)
+#define IOCTL_RKNPU_MEM_DESTROY                                                \
+	RKNPU_IOWR(RKNPU_MEM_DESTROY, struct rknpu_mem_destroy)
+#define IOCTL_RKNPU_MEM_SYNC RKNPU_IOWR(RKNPU_MEM_SYNC, struct rknpu_mem_sync)
+
+#endif
--- a/include/drm-uapi/rocket_accel.h
+++ b/include/drm-uapi/rocket_accel.h
@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Tomeu Vizoso
+ */
+#ifndef __DRM_UAPI_ROCKET_ACCEL_H__
+#define __DRM_UAPI_ROCKET_ACCEL_H__
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_ROCKET_CREATE_BO			0x00
+#define DRM_ROCKET_SUBMIT			0x01
+#define DRM_ROCKET_PREP_BO			0x02
+#define DRM_ROCKET_FINI_BO			0x03
+
+#define DRM_IOCTL_ROCKET_CREATE_BO		DRM_IOWR(DRM_COMMAND_BASE + DRM_ROCKET_CREATE_BO, struct drm_rocket_create_bo)
+#define DRM_IOCTL_ROCKET_SUBMIT			DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_SUBMIT, struct drm_rocket_submit)
+#define DRM_IOCTL_ROCKET_PREP_BO		DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_PREP_BO, struct drm_rocket_prep_bo)
+#define DRM_IOCTL_ROCKET_FINI_BO		DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_FINI_BO, struct drm_rocket_fini_bo)
+
+/**
+ * struct drm_rocket_create_bo - ioctl argument for creating Rocket BOs.
+ *
+ */
+struct drm_rocket_create_bo {
+	/** Input: Size of the requested BO. */
+	__u32 size;
+
+	/** Output: GEM handle for the BO. */
+	__u32 handle;
+
+	/**
+	 * Output: DMA address for the BO in the NPU address space.  This address
+	 * is private to the DRM fd and is valid for the lifetime of the GEM
+	 * handle.
+	 */
+	__u64 dma_address;
+
+	/** Output: Offset into the drm node to use for subsequent mmap call. */
+	__u64 offset;
+};
+
+/**
+ * struct drm_rocket_prep_bo - ioctl argument for starting CPU ownership of the BO.
+ *
+ * Takes care of waiting for any NPU jobs that might still use the BO and performs cache
+ * synchronization.
+ */
+struct drm_rocket_prep_bo {
+	/** Input: GEM handle of the buffer object. */
+	__u32 handle;
+
+	/** Reserved, must be zero. */
+	__u32 reserved;
+
+	/** Input: Amount of time to wait for NPU jobs. */
+	__s64 timeout_ns;
+};
+
+/**
+ * struct drm_rocket_fini_bo - ioctl argument for finishing CPU ownership of the BO.
+ *
+ * Synchronize caches for NPU access.
+ */
+struct drm_rocket_fini_bo {
+	/** Input: GEM handle of the buffer object. */
+	__u32 handle;
+
+	/** Reserved, must be zero. */
+	__u32 reserved;
+};
+
+/**
+ * struct drm_rocket_task - A task to be run on the NPU
+ *
+ * A task is the smallest unit of work that can be run on the NPU.
+ */
+struct drm_rocket_task {
+	/** Input: DMA address to NPU mapping of register command buffer */
+	__u32 regcmd;
+
+	/** Input: Number of commands in the register command buffer */
+	__u32 regcmd_count;
+};
+
+/**
+ * struct drm_rocket_job - A job to be run on the NPU
+ *
+ * The kernel will schedule the execution of this job taking into account its
+ * dependencies with other jobs. All tasks in the same job will be executed
+ * sequentially on the same core, to benefit from memory residency in SRAM.
+ */
+struct drm_rocket_job {
+	/** Input: Pointer to an array of struct drm_rocket_task. */
+	__u64 tasks;
+
+	/** Input: Pointer to a u32 array of the BOs that are read by the job. */
+	__u64 in_bo_handles;
+
+	/** Input: Pointer to a u32 array of the BOs that are written to by the job. */
+	__u64 out_bo_handles;
+
+	/** Input: Number of tasks passed in. */
+	__u32 task_count;
+
+	/** Input: Size in bytes of the structs in the @tasks field. */
+	__u32 task_struct_size;
+
+	/** Input: Number of input BO handles passed in (size is that times 4). */
+	__u32 in_bo_handle_count;
+
+	/** Input: Number of output BO handles passed in (size is that times 4). */
+	__u32 out_bo_handle_count;
+};
+
+/**
+ * struct drm_rocket_submit - ioctl argument for submitting commands to the NPU.
+ *
+ * The kernel will schedule the execution of these jobs in dependency order.
+ */
+struct drm_rocket_submit {
+	/** Input: Pointer to an array of struct drm_rocket_job. */
+	__u64 jobs;
+
+	/** Input: Number of jobs passed in. */
+	__u32 job_count;
+
+	/** Input: Size in bytes of the structs in the @jobs field. */
+	__u32 job_struct_size;
+
+	/** Reserved, must be zero. */
+	__u64 reserved;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __DRM_UAPI_ROCKET_ACCEL_H__ */
--- a/meson.build
+++ b/meson.build
@ -181,7 +181,7 @@ elif gallium_drivers.contains('all')
   gallium_drivers = [
     'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915',
     'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris',
-     'zink', 'd3d12', 'asahi'
+     'zink', 'd3d12', 'asahi', 'rocket'
   ]
 endif

@ -208,6 +208,7 @@ with_gallium_lima = gallium_drivers.contains('lima')
 with_gallium_zink = gallium_drivers.contains('zink')
 with_gallium_d3d12 = gallium_drivers.contains('d3d12')
 with_gallium_asahi = gallium_drivers.contains('asahi')
+with_gallium_rocket = gallium_drivers.contains('rocket')
 foreach gallium_driver : gallium_drivers
  pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
 endforeach
--- a/meson.options
+++ b/meson.options
@ -82,7 +82,7 @@ option(
    'all', 'auto',
    'asahi', 'crocus', 'd3d12', 'etnaviv', 'freedreno', 'i915', 'iris',
    'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
-    'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
+    'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
  ],
  description : 'List of gallium drivers to build. If this is set to auto ' +
                'all drivers applicable to the target OS/architecture ' +
--- a/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt
+++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-fails.txt
@ -0,0 +1,126 @@
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_112_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_3_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_5_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_80_weight_size_5_input_channels_1_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_120_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_32_output_channels_16_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_5_input_channels_32_output_channels_16_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_0_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_1_stride_1_padding_same_1_is_signed_0,Fail
+Add.Op/input_size_8_weight_size_3_input_channels_32_output_channels_1_stride_2_padding_same_1_is_signed_0,Fail
+Conv2D.Op/input_size_112_weight_size_1_input_channels_120_output_channels_120_stride_1_padding_same_1_is_signed_0,Fail
+
+Models.Op/mobiledet_086,Fail
+Models.Op/mobiledet_087,Fail
+Models.Op/mobiledet_ssdlite_mobiledet_coco_qat_postprocess,Fail
+
+Models.Op/yolox_005,Fail
+Models.Op/yolox_007,Fail
+Models.Op/yolox_008,Fail
+Models.Op/yolox_009,Fail
+Models.Op/yolox_010,Fail
+Models.Op/yolox_012,Fail
+Models.Op/yolox_014,Fail
+Models.Op/yolox_016,Fail
+Models.Op/yolox_018,Fail
+Models.Op/yolox_019,Fail
+Models.Op/yolox_021,Fail
+Models.Op/yolox_022,Fail
+Models.Op/yolox_024,Fail
+Models.Op/yolox_025,Fail
+Models.Op/yolox_031,Fail
+Models.Op/yolox_034,Fail
+Models.Op/yolox_037,Fail
+Models.Op/yolox_040,Fail
+Models.Op/yolox_046,Fail
+Models.Op/yolox_055,Fail
+Models.Op/yolox_064,Fail
+Models.Op/yolox_072,Fail
+Models.Op/yolox_073,Fail
+Models.Op/yolox_078,Fail
+Models.Op/yolox_082,Fail
+Models.Op/yolox_087,Fail
+Models.Op/yolox_091,Fail
+Models.Op/yolox_096,Fail
+Models.Op/yolox_097,Fail
+Models.Op/yolox_100,Fail
+Models.Op/yolox_101,Fail
+Models.Op/yolox_107,Fail
+Models.Op/yolox_108,Fail
+Models.Op/yolox_111,Fail
+Models.Op/yolox_112,Fail
+Models.Op/yolox_118,Fail
+Models.Op/yolox_119,Fail
+Models.Op/yolox_122,Fail
+Models.Op/yolox_123,Fail
+Models.Op/yolox_yolox,Fail
--- a/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt
+++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-flakes.txt
@ -0,0 +1,5 @@
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_1_padding_same_0_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_2_padding_same_0_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_1_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_32_output_channels_1_stride_2_padding_same_1_is_signed_0
+Add.Op/input_size_8_weight_size_1_input_channels_120_output_channels_1_stride_2_padding_same_0_is_signed_0
--- a/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt
+++ b/src/gallium/drivers/rocket/ci/rocket-rk3588-skips.txt
@ -0,0 +1,29 @@
+Add.Op/.*
+AddQuant.Op/.*
+Conv2D.Op/.*
+DepthwiseConv2D.Op/.*
+FullyConnected.Op/.*
+
+# These tests below (adds) aren't well constructed and thus fail in TF
+Models.Op/mobiledet_008
+Models.Op/mobiledet_011
+Models.Op/mobiledet_014
+Models.Op/mobiledet_019
+Models.Op/mobiledet_022
+Models.Op/mobiledet_025
+Models.Op/mobiledet_032
+Models.Op/mobiledet_035
+Models.Op/mobiledet_038
+Models.Op/mobiledet_045
+Models.Op/mobiledet_049
+Models.Op/mobiledet_053
+Models.Op/mobiledet_060
+Models.Op/mobiledet_064
+Models.Op/mobiledet_068
+Models.Op/yolox_011
+Models.Op/yolox_020
+Models.Op/yolox_023
+Models.Op/yolox_026
+Models.Op/yolox_035
+Models.Op/yolox_038
+Models.Op/yolox_041
--- a/src/gallium/drivers/rocket/decode.py
+++ b/src/gallium/drivers/rocket/decode.py
@ -0,0 +1,75 @@
+#!/usr/bin/python3
+#
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+import os
+import argparse
+import struct
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+def main():
+	"""Decode a binary register-command dump into EMIT() statements.
+
+	--xml names the register description handed to gen_parser's Parser;
+	--dump names a binary file that is read as consecutive 8-byte
+	little-endian commands: 16-bit offset, 32-bit value, 16-bit target.
+
+	Commands whose offset matches a known register are printed as
+	EMIT(REG_<NAME>, <fields>); anything else is printed as raw hex.
+	Exits with status 1 if the XML cannot be parsed.
+	"""
+	parser = argparse.ArgumentParser()
+	parser.add_argument('--xml', type=str, required=True)
+	parser.add_argument('--dump', type=str, required=True)
+
+	args = parser.parse_args()
+
+	p = Parser()
+
+	try:
+		p.parse("", args.xml)
+	except Error as e:
+		print(e, file=sys.stderr)
+		# sys.exit() rather than the site-provided exit(), which is not
+		# guaranteed to exist (e.g. under "python -S").
+		sys.exit(1)
+
+	# Register definitions keyed by their offset.
+	regs = {}
+	for e in p.file:
+		if isinstance(e, Reg):
+			regs[e.offset] = e
+
+	# Values of the "target" enum keyed by name.
+	domains = {}
+	for e in p.file:
+		if isinstance(e, Enum) and e.name == "target":
+			for name, val in e.values:
+				domains[name] = val
+
+	# Only whole 8-byte commands are decoded; a trailing partial one is ignored.
+	with open(args.dump, mode='rb') as f:
+		for i in range(0, os.path.getsize(args.dump) // 8):
+			cmd = f.read(8)
+			# Unsigned 16-bit fields: with signed "h", offsets and targets
+			# that have bit 15 set (>= 0x8000) would decode as negative
+			# numbers and never match the register table or domain values.
+			(offset, value, target) = struct.unpack("<HIH", cmd)
+			if offset in regs:
+				reg = regs[offset]
+
+				# Bit 0 of the target is masked off before comparing.
+				if (target & 0xfffffffe) != domains[reg.domain]:
+					print("WARNING: target 0x%x doesn't match register's domain 0x%x" % (target, domains[reg.domain]))
+
+				print("EMIT(REG_%s, " % reg.full_name.upper(), end="")
+				first = True
+				if value == 0 or len(reg.bitset.fields) == 1:
+					print("0x%x" % value, end="")
+				else:
+					for field in reg.bitset.fields:
+						if field.type == "boolean":
+							if (1 << field.high) & value:
+								if not first:
+									print(" | ", end="")
+								print("%s_%s" % (reg.full_name.upper(), field.name.upper()), end="")
+								first = False
+						elif field.type == "uint":
+							field_value = (value & mask(field.low, field.high)) >> field.low
+							if field_value != 0:
+								if not first:
+									print(" | ", end="")
+								print("%s_%s(%d)" % (reg.full_name.upper(), field.name.upper(), field_value), end="")
+								first = False
+				print(");")
+			else:
+				print("%x %x %x" % (target, offset, value))
+
+if __name__ == '__main__':
+	main()
--- a/src/gallium/drivers/rocket/extract_registers.py
+++ b/src/gallium/drivers/rocket/extract_registers.py
@ -0,0 +1,121 @@
+#!/usr/bin/python3
+#
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import collections
+import csv
+import subprocess
+import sys
+from itertools import dropwhile
+import camelot
+
+trm_file = sys.argv[1]
+if trm_file.endswith(".pdf"):
+    data = subprocess.check_output(["pdftotext", "-tsv", sys.argv[1], "-"]).decode()
+else:
+    assert(trm_file.endswith(".txt"))
+    data = open(sys.argv[1]).read()
+
+data = csv.reader(data.splitlines(), delimiter="\t")
+data = collections.deque([x[11] for x in data])
+
+def popcell(data):
+    cell = []
+    while data[0] != "###FLOW###":
+        text = data.popleft()
+        cell.append(text)
+    data.popleft() ###FLOW###
+    data.popleft() ###LINE###
+    return cell
+
+text = None
+while data[0] != "RKNN_pc_operation_enable":
+    data.popleft()
+
+def read_reg_offset(data):
+    while data:
+        text = data.popleft()
+        if text.startswith("(0x"):
+            return text.replace("(", "").replace(")", "")
+
+reg_names = []
+offsets = []
+while text != "RKNN_global_operation_enable":
+    text = data.popleft()
+
+    if text.startswith("RKNN_"):
+        reg_names.append(text)
+        offsets.append(read_reg_offset(data))
+
+print("Found %d registers in RKNN block" % len(reg_names))
+
+"""
+print(reg_names)
+print(offsets)
+sys.exit(0)
+"""
+
+tables = camelot.read_pdf(sys.argv[1], line_scale=35, pages="0-60")
+tables = collections.deque([x.data for x in tables[3:]])
+
+# Join tables split by page breaks
+new_tables = []
+while tables:
+    new_table = tables.popleft()
+    last_bitfield = new_table[-1][0].split(" ")[0]
+    while last_bitfield != "0" and not last_bitfield.endswith(":0"):
+        second_part = tables.popleft()
+        new_table.extend(second_part[1:])
+        last_bitfield = second_part[-1][0].split(" ")[0]
+    new_tables.append(new_table)
+tables = new_tables
+print("Found %d tables in PDF" % len(tables))
+
+domains = {}
+for i in range(0, len(reg_names)):
+    reg_name = reg_names[i]
+    if "dpu_rdma" in reg_name:
+        domain = "dpu_rdma"
+    elif "ppu_rdma" in reg_name:
+        domain = "ppu_rdma"
+    else:
+        domain = reg_name.split("_")[1]
+    table = tables[i]
+
+    if domain not in domains.keys():
+        domains[domain] = []
+
+    reg = {}
+    reg["name"] = reg_name
+    reg["offset"] = offsets[i]
+    reg["field_names"] = []
+    reg["field_bits"] = []
+
+    reserved_count = 0
+    for row in table[1:]:
+        name = row[3].split('\n')[0]
+
+        if name == "reserved":
+            name = "reserved_%d" % reserved_count
+            reserved_count += 1
+
+        reg["field_bits"].append(row[0].split(' ')[0])
+        reg["field_names"].append(name)
+
+    domains[domain].append(reg)
+
+for domain in domains.keys():
+    print('    <domain name="%s" width="32">' % domain.upper())
+    for reg in domains[domain]:
+        print('        <reg32 offset="%s" name="%s">' % (reg["offset"], "_".join(reg["name"].strip().upper().split("_")[2:])))
+        for i in range(0, len(reg["field_names"])):
+            if ":" in reg["field_bits"][i]:
+                high, low = reg["field_bits"][i].split(":")
+                bits = 'low="%s" high="%s"' % (low, high)
+            else:
+                bits = 'pos="%s"' % reg["field_bits"][i]
+            print('            <bitfield name="%s" %s type="uint"/>' % (reg["field_names"][i].strip().upper(), bits))
+        print('        </reg32>')
+    print('    </domain>')
--- a/src/gallium/drivers/rocket/gen_header.py
+++ b/src/gallium/drivers/rocket/gen_header.py
@ -0,0 +1,137 @@
+#!/usr/bin/python3
+#
+# Copyright © 2019-2024 Google, Inc.
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+import os
+import argparse
+import time
+import datetime
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+def dump_c(args, guard, func):
+	p = Parser()
+
+	try:
+		p.parse(args.rnn, args.xml)
+	except Error as e:
+		print(e, file=sys.stderr)
+		exit(1)
+
+	print("#ifndef %s\n#define %s\n" % (guard, guard))
+
+	print("""/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng gen_header.py tool in this git repository:
+http://gitlab.freedesktop.org/mesa/mesa/
+git clone https://gitlab.freedesktop.org/mesa/mesa.git
+
+The rules-ng-ng source files this header was generated from are:
+""")
+	maxlen = 0
+	for filepath in p.xml_files:
+		maxlen = max(maxlen, len(filepath))
+	for filepath in p.xml_files:
+		pad = " " * (maxlen - len(filepath))
+		filesize = str(os.path.getsize(filepath))
+		filesize = " " * (7 - len(filesize)) + filesize
+		filetime = time.ctime(os.path.getmtime(filepath))
+		print("- " + filepath + pad + " (" + filesize + " bytes, from " + filetime + ")")
+	if p.copyright_year:
+		current_year = str(datetime.date.today().year)
+		print()
+		print("Copyright (C) %s-%s by the following authors:" % (p.copyright_year, current_year))
+		for author in p.authors:
+			print("- " + author)
+	if p.license:
+		print(p.license)
+	print("*/")
+
+	print()
+	print("#ifdef __KERNEL__")
+	print("#include <linux/bug.h>")
+	print("#define assert(x) BUG_ON(!(x))")
+	print("#else")
+	print("#include <assert.h>")
+	print("#endif")
+	print()
+
+	print("#ifdef __cplusplus")
+	print("#define __struct_cast(X)")
+	print("#else")
+	print("#define __struct_cast(X) (struct X)")
+	print("#endif")
+	print()
+
+	func(p)
+
+	print("static uint32_t rkt_get_target(uint32_t offset)")
+	print("{")
+
+	print("\tswitch(offset) {")
+	for e in p.file:
+		if isinstance(e, Reg):
+			print("\t\tcase REG_%s:" % e.full_name)
+			print("\t\t\treturn %s;" % e.domain)
+	print("\t}")
+	print("\treturn 0;")
+	print("}")
+
+	print("\n#endif /* %s */" % guard)
+
+
+def dump_c_defines(args):
+	guard = str.replace(os.path.basename(args.xml), '.', '_').upper()
+	dump_c(args, guard, lambda p: p.dump())
+
+
+def dump_c_pack_structs(args):
+	guard = str.replace(os.path.basename(args.xml), '.', '_').upper() + '_STRUCTS'
+	dump_c(args, guard, lambda p: p.dump_structs())
+
+
+def dump_py_defines(args):
+	p = Parser()
+
+	try:
+		p.parse(args.rnn, args.xml)
+	except Error as e:
+		print(e, file=sys.stderr)
+		exit(1)
+
+	file_name = os.path.splitext(os.path.basename(args.xml))[0]
+
+	print("from enum import IntEnum")
+	print("class %sRegs(IntEnum):" % file_name.upper())
+
+	os.path.basename(args.xml)
+
+	p.dump_regs_py()
+
+
+def main():
+	parser = argparse.ArgumentParser()
+	parser.add_argument('--rnn', type=str, required=True)
+	parser.add_argument('--xml', type=str, required=True)
+
+	subparsers = parser.add_subparsers(required=True)
+
+	parser_c_defines = subparsers.add_parser('c-defines')
+	parser_c_defines.set_defaults(func=dump_c_defines)
+
+	parser_c_pack_structs = subparsers.add_parser('c-pack-structs')
+	parser_c_pack_structs.set_defaults(func=dump_c_pack_structs)
+
+	parser_py_defines = subparsers.add_parser('py-defines')
+	parser_py_defines.set_defaults(func=dump_py_defines)
+
+	args = parser.parse_args()
+	args.func(args)
+
+
+if __name__ == '__main__':
+	main()
--- a/src/gallium/drivers/rocket/gen_parser.py
+++ b/src/gallium/drivers/rocket/gen_parser.py
@ -0,0 +1,737 @@
+import xml.parsers.expat
+import sys
+import os
+import collections
+
+class Error(Exception):
+	"""Parse error; the text is also available as the `message` attribute."""
+	def __init__(self, message):
+		super().__init__(message)
+		self.message = message
+
+class Enum(object):
+	def __init__(self, name):
+		self.name = name
+		self.values = []
+
+	def has_name(self, name):
+		for (n, value) in self.values:
+			if n == name:
+				return True
+		return False
+
+	def dump(self):
+		use_hex = False
+		for (name, value) in self.values:
+			if value > 0x1000:
+				use_hex = True
+
+		print("enum %s {" % self.name)
+		for (name, value) in self.values:
+			if use_hex:
+				print("\t%s = 0x%08x," % (name, value))
+			else:
+				print("\t%s = %d," % (name, value))
+		print("};\n")
+
+	def dump_pack_struct(self):
+		pass
+
+class Field(object):
+	def __init__(self, name, low, high, shr, type, parser):
+		self.name = name
+		self.low = low
+		self.high = high
+		self.shr = shr
+		self.type = type
+
+		builtin_types = [ None, "a3xx_regid", "boolean", "uint", "hex", "int", "fixed", "ufixed", "float", "address", "waddress" ]
+
+		maxpos = parser.current_bitsize - 1
+
+		if low < 0 or low > maxpos:
+			raise parser.error("low attribute out of range: %d" % low)
+		if high < 0 or high > maxpos:
+			raise parser.error("high attribute out of range: %d" % high)
+		if high < low:
+			raise parser.error("low is greater than high: low=%d, high=%d" % (low, high))
+		if self.type == "boolean" and not low == high:
+			raise parser.error("booleans should be 1 bit fields")
+		elif self.type == "float" and not (high - low == 31 or high - low == 15):
+			raise parser.error("floats should be 16 or 32 bit fields")
+		elif not self.type in builtin_types and not self.type in parser.enums:
+			raise parser.error("unknown type '%s'" % self.type)
+
+	def ctype(self, var_name):
+		if self.type == None:
+			type = "uint32_t"
+			val = var_name
+		elif self.type == "boolean":
+			type = "bool"
+			val = var_name
+		elif self.type == "uint" or self.type == "hex" or self.type == "a3xx_regid":
+			type = "uint32_t"
+			val = var_name
+		elif self.type == "int":
+			type = "int32_t"
+			val = var_name
+		elif self.type == "fixed":
+			type = "float"
+			val = "((int32_t)(%s * %d.0))" % (var_name, 1 << self.radix)
+		elif self.type == "ufixed":
+			type = "float"
+			val = "((uint32_t)(%s * %d.0))" % (var_name, 1 << self.radix)
+		elif self.type == "float" and self.high - self.low == 31:
+			type = "float"
+			val = "fui(%s)" % var_name
+		elif self.type == "float" and self.high - self.low == 15:
+			type = "float"
+			val = "_mesa_float_to_half(%s)" % var_name
+		elif self.type in [ "address", "waddress" ]:
+			type = "uint64_t"
+			val = var_name
+		else:
+			type = "enum %s" % self.type
+			val = var_name
+
+		if self.shr > 0:
+			val = "(%s >> %d)" % (val, self.shr)
+
+		return (type, val)
+
+def tab_to(name, value):
+	tab_count = (68 - (len(name) & ~7)) // 8
+	if tab_count <= 0:
+		tab_count = 1
+	print(name + ('\t' * tab_count) + value)
+
+def mask(low, high):
+	return ((0xffffffffffffffff >> (64 - (high + 1 - low))) << low)
+
+def field_name(reg, f):
+	if f.name:
+		name = f.name.lower()
+	else:
+		# We hit this path when a reg is defined with no bitset fields, ie.
+		# 	<reg32 offset="0x88db" name="RB_BLIT_DST_ARRAY_PITCH" low="0" high="28" shr="6" type="uint"/>
+		name = reg.name.lower()
+
+	if (name in [ "double", "float", "int" ]) or not (name[0].isalpha()):
+			name = "_" + name
+
+	return name
+
+class Bitset(object):
+	def __init__(self, name, template):
+		self.name = name
+		self.inline = False
+		if template:
+			self.fields = template.fields[:]
+		else:
+			self.fields = []
+
+	# Get address field if there is one in the bitset, else return None:
+	def get_address_field(self):
+		for f in self.fields:
+			if f.type in [ "address", "waddress" ]:
+				return f
+		return None
+
+	def dump_regpair_builder(self, reg):
+		print("#ifndef NDEBUG")
+		known_mask = 0
+		for f in self.fields:
+			known_mask |= mask(f.low, f.high)
+			if f.type in [ "boolean", "address", "waddress" ]:
+				continue
+			type, val = f.ctype("fields.%s" % field_name(reg, f))
+			print("    assert((%-40s & 0x%08x) == 0);" % (val, 0xffffffff ^ mask(0 , f.high - f.low)))
+		print("    assert((%-40s & 0x%08x) == 0);" % ("fields.unknown", known_mask))
+		print("#endif\n")
+
+		print("    return (struct fd_reg_pair) {")
+		if reg.array:
+			print("        .reg = REG_%s(__i)," % reg.full_name)
+		else:
+			print("        .reg = REG_%s," % reg.full_name)
+
+		print("        .value =")
+		for f in self.fields:
+			if f.type in [ "address", "waddress" ]:
+				continue
+			else:
+				type, val = f.ctype("fields.%s" % field_name(reg, f))
+				print("            (%-40s << %2d) |" % (val, f.low))
+		value_name = "dword"
+		if reg.bit_size == 64:
+			value_name = "qword"
+		print("            fields.unknown | fields.%s," % (value_name,))
+
+		address = self.get_address_field()
+		if address:
+			print("        .bo = fields.bo,")
+			print("        .is_address = true,")
+			if f.type == "waddress":
+				print("        .bo_write = true,")
+			print("        .bo_offset = fields.bo_offset,")
+			print("        .bo_shift = %d," % address.shr)
+			print("        .bo_low = %d," % address.low)
+
+		print("    };")
+
+	def dump_pack_struct(self, reg=None):
+		if not reg:
+			return
+
+		prefix = reg.full_name
+
+		print("struct %s {" % prefix)
+		for f in self.fields:
+			if f.type in [ "address", "waddress" ]:
+				tab_to("    __bo_type", "bo;")
+				tab_to("    uint32_t", "bo_offset;")
+				continue
+			name = field_name(reg, f)
+
+			type, val = f.ctype("var")
+
+			tab_to("    %s" % type, "%s;" % name)
+		if reg.bit_size == 64:
+			tab_to("    uint64_t", "unknown;")
+			tab_to("    uint64_t", "qword;")
+		else:
+			tab_to("    uint32_t", "unknown;")
+			tab_to("    uint32_t", "dword;")
+		print("};\n")
+
+		if reg.array:
+			print("static inline struct fd_reg_pair\npack_%s(uint32_t __i, struct %s fields)\n{" %
+				  (prefix, prefix))
+		else:
+			print("static inline struct fd_reg_pair\npack_%s(struct %s fields)\n{" %
+				  (prefix, prefix))
+
+		self.dump_regpair_builder(reg)
+
+		print("\n}\n")
+
+		if self.get_address_field():
+			skip = ", { .reg = 0 }"
+		else:
+			skip = ""
+
+		if reg.array:
+			print("#define %s(__i, ...) pack_%s(__i, __struct_cast(%s) { __VA_ARGS__ })%s\n" %
+				  (prefix, prefix, prefix, skip))
+		else:
+			print("#define %s(...) pack_%s(__struct_cast(%s) { __VA_ARGS__ })%s\n" %
+				  (prefix, prefix, prefix, skip))
+
+
+	def dump(self, prefix=None):
+		if prefix == None:
+			prefix = self.name
+		for f in self.fields:
+			if f.name:
+				name = prefix + "_" + f.name
+			else:
+				name = prefix
+
+			if not f.name and f.low == 0 and f.shr == 0 and not f.type in ["float", "fixed", "ufixed"]:
+				pass
+			elif f.type == "boolean" or (f.type == None and f.low == f.high):
+				tab_to("#define %s" % name, "0x%08x" % (1 << f.low))
+			else:
+				tab_to("#define %s__MASK" % name, "0x%08x" % mask(f.low, f.high))
+				tab_to("#define %s__SHIFT" % name, "%d" % f.low)
+				type, val = f.ctype("val")
+
+				print("static inline uint32_t %s(%s val)\n{" % (name, type))
+				if f.shr > 0:
+					print("\tassert(!(val & 0x%x));" % mask(0, f.shr - 1))
+				print("\treturn ((%s) << %s__SHIFT) & %s__MASK;\n}" % (val, name, name))
+		print()
+
+class Array(object):
+	def __init__(self, attrs, domain, variant):
+		if "name" in attrs:
+			self.name = attrs["name"]
+		else:
+			self.name = ""
+		self.domain = domain
+		self.variant = variant
+		self.offset = int(attrs["offset"], 0)
+		self.stride = int(attrs["stride"], 0)
+		self.length = int(attrs["length"], 0)
+		if "usage" in attrs:
+			self.usages = attrs["usage"].split(',')
+		else:
+			self.usages = None
+
+	def dump(self):
+		print("#define REG_%s_%s(i0) (0x%08x + 0x%x*(i0))\n" % (self.domain, self.name, self.offset, self.stride))
+
+	def dump_pack_struct(self):
+		pass
+
+	def dump_regpair_builder(self):
+		pass
+
+class Reg(object):
+	def __init__(self, attrs, domain, array, bit_size):
+		self.name = attrs["name"]
+		self.domain = domain
+		self.array = array
+		self.offset = int(attrs["offset"], 0)
+		self.type = None
+		self.bit_size = bit_size
+		if array:
+			self.name = array.name + "_" + self.name
+		self.full_name = self.domain + "_" + self.name
+
+	def dump(self):
+		if self.array:
+			offset = self.array.offset + self.offset
+			print("static inline uint32_t REG_%s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (self.full_name, offset, self.array.stride))
+		else:
+			tab_to("#define REG_%s" % self.full_name, "0x%08x" % self.offset)
+
+		if self.bitset.inline:
+			self.bitset.dump(self.full_name)
+
+	def dump_pack_struct(self):
+		if self.bitset.inline:
+			self.bitset.dump_pack_struct(self)
+
+	def dump_regpair_builder(self):
+		if self.bitset.inline:
+			self.bitset.dump_regpair_builder(self)
+
+	def dump_py(self):
+		print("\tREG_%s = 0x%08x" % (self.full_name, self.offset))
+
+
+class Parser(object):
+	def __init__(self):
+		self.current_array = None
+		self.current_domain = None
+		self.current_prefix = None
+		self.current_prefix_type = None
+		self.current_stripe = None
+		self.current_bitset = None
+		self.current_bitsize = 32
+		# The varset attribute on the domain specifies the enum which
+		# specifies all possible hw variants:
+		self.current_varset = None
+		# Regs that have multiple variants.. we only generated the C++
+		# template based struct-packers for these
+		self.variant_regs = {}
+		# Information in which contexts regs are used, to be used in
+		# debug options
+		self.usage_regs = collections.defaultdict(list)
+		self.bitsets = {}
+		self.enums = {}
+		self.variants = set()
+		self.file = []
+		self.xml_files = []
+		self.copyright_year = None
+		self.authors = []
+		self.license = None
+
+	def error(self, message):
+		"""Return (not raise) an Error for `message`, prefixed with the file
+		name, line and column the innermost active parser is currently at."""
+		parser, filename = self.stack[-1]
+		line = parser.CurrentLineNumber
+		column = parser.CurrentColumnNumber
+		return Error("%s:%d:%d: %s" % (filename, line, column, message))
+
+	def prefix(self, variant=None):
+		if self.current_prefix_type == "variant" and variant:
+			return variant
+		elif self.current_stripe:
+			return self.current_stripe + "_" + self.current_domain
+		elif self.current_prefix:
+			return self.current_prefix + "_" + self.current_domain
+		else:
+			return self.current_domain
+
+	def parse_field(self, name, attrs):
+		try:
+			if "pos" in attrs:
+				high = low = int(attrs["pos"], 0)
+			elif "high" in attrs and "low" in attrs:
+				high = int(attrs["high"], 0)
+				low = int(attrs["low"], 0)
+			else:
+				low = 0
+				high = self.current_bitsize - 1
+
+			if "type" in attrs:
+				type = attrs["type"]
+			else:
+				type = None
+
+			if "shr" in attrs:
+				shr = int(attrs["shr"], 0)
+			else:
+				shr = 0
+
+			b = Field(name, low, high, shr, type, self)
+
+			if type == "fixed" or type == "ufixed":
+				b.radix = int(attrs["radix"], 0)
+
+			self.current_bitset.fields.append(b)
+		except ValueError as e:
+			raise self.error(e)
+
+	def parse_varset(self, attrs):
+		# Inherit the varset from the enclosing domain if not overriden:
+		varset = self.current_varset
+		if "varset" in attrs:
+			varset = self.enums[attrs["varset"]]
+		return varset
+
+	def parse_variants(self, attrs):
+		if not "variants" in attrs:
+				return None
+		variant = attrs["variants"].split(",")[0]
+		if "-" in variant:
+			variant = variant[:variant.index("-")]
+
+		varset = self.parse_varset(attrs)
+
+		assert varset.has_name(variant)
+
+		return variant
+
+	def add_all_variants(self, reg, attrs, parent_variant):
+		# TODO this should really handle *all* variants, including dealing
+		# with open ended ranges (ie. "A2XX,A4XX-") (we have the varset
+		# enum now to make that possible)
+		variant = self.parse_variants(attrs)
+		if not variant:
+			variant = parent_variant
+
+		if reg.name not in self.variant_regs:
+			self.variant_regs[reg.name] = {}
+		else:
+			# All variants must be same size:
+			v = next(iter(self.variant_regs[reg.name]))
+			assert self.variant_regs[reg.name][v].bit_size == reg.bit_size
+
+		self.variant_regs[reg.name][variant] = reg
+
+	def add_all_usages(self, reg, usages):
+		if not usages:
+			return
+
+		for usage in usages:
+			self.usage_regs[usage].append(reg)
+
+		self.variants.add(reg.domain)
+
+	def do_validate(self, schemafile):
+		try:
+			from lxml import etree
+
+			parser, filename = self.stack[-1]
+			dirname = os.path.dirname(filename)
+
+			# we expect this to look like <namespace url> schema.xsd.. I think
+			# technically it is supposed to be just a URL, but that doesn't
+			# quite match up to what we do.. Just skip over everything up to
+			# and including the first whitespace character:
+			schemafile = schemafile[schemafile.rindex(" ")+1:]
+
+			# this is a bit cheezy, but the xml file to validate could be
+			# in a child director, ie. we don't really know where the schema
+			# file is, the way the rnn C code does.  So if it doesn't exist
+			# just look one level up
+			if not os.path.exists(dirname + "/" + schemafile):
+				schemafile = "../" + schemafile
+
+			if not os.path.exists(dirname + "/" + schemafile):
+				raise self.error("Cannot find schema for: " + filename)
+
+			xmlschema_doc = etree.parse(dirname + "/" + schemafile)
+			xmlschema = etree.XMLSchema(xmlschema_doc)
+
+			xml_doc = etree.parse(filename)
+			if not xmlschema.validate(xml_doc):
+				error_str = str(xmlschema.error_log.filter_from_errors()[0])
+				raise self.error("Schema validation failed for: " + filename + "\n" + error_str)
+		except ImportError:
+			print("lxml not found, skipping validation", file=sys.stderr)
+
+	def do_parse(self, filename):
+		filepath = os.path.abspath(filename)
+		if filepath in self.xml_files:
+			return
+		self.xml_files.append(filepath)
+		file = open(filename, "rb")
+		parser = xml.parsers.expat.ParserCreate()
+		self.stack.append((parser, filename))
+		parser.StartElementHandler = self.start_element
+		parser.EndElementHandler = self.end_element
+		parser.CharacterDataHandler = self.character_data
+		parser.buffer_text = True
+		parser.ParseFile(file)
+		self.stack.pop()
+		file.close()
+
+	def parse(self, rnn_path, filename):
+		self.path = rnn_path
+		self.stack = []
+		self.do_parse(filename)
+
+	def parse_reg(self, attrs, bit_size):
+		"""Handle a <reg32>/<reg64> start tag with attributes `attrs`.
+
+		Selects or creates the bitset that holds the register's fields,
+		creates the Reg (kept in self.current_reg), and records it in the
+		output list and in the variant and usage tables.
+		"""
+		self.current_bitsize = bit_size
+		if "type" in attrs and attrs["type"] in self.bitsets:
+			# The type names a previously defined bitset.
+			bitset = self.bitsets[attrs["type"]]
+			if bitset.inline:
+				# Inline bitsets are copied, so that bitfields nested in this
+				# register do not modify the shared definition.
+				self.current_bitset = Bitset(attrs["name"], bitset)
+				self.current_bitset.inline = True
+			else:
+				self.current_bitset = bitset
+		else:
+			self.current_bitset = Bitset(attrs["name"], None)
+			self.current_bitset.inline = True
+			if "type" in attrs:
+				# The register itself is typed: describe it as one anonymous
+				# field built from the register's own attributes.
+				self.parse_field(None, attrs)
+
+		# The variant comes from the register or else from the enclosing array.
+		variant = self.parse_variants(attrs)
+		if not variant and self.current_array:
+			variant = self.current_array.variant
+
+		self.current_reg = Reg(attrs, self.prefix(variant), self.current_array, bit_size)
+		self.current_reg.bitset = self.current_bitset
+
+		# Only elements of the top-level file are dumped; the stack holds one
+		# entry per file currently being parsed.
+		if len(self.stack) == 1:
+			self.file.append(self.current_reg)
+
+		if variant is not None:
+			self.add_all_variants(self.current_reg, attrs, variant)
+
+		# Usages come from the register or else from the enclosing array.
+		usages = None
+		if "usage" in attrs:
+			usages = attrs["usage"].split(',')
+		elif self.current_array:
+			usages = self.current_array.usages
+
+		self.add_all_usages(self.current_reg, usages)
+
+	def start_element(self, name, attrs):
+		"""expat StartElementHandler: update the parser state for the opening
+		tag `name` with attributes `attrs`. Unknown elements are ignored."""
+		# Character data is collected per element, see character_data().
+		self.cdata = ""
+		if name == "import":
+			# Parse the imported file, looked up relative to the rnn path.
+			filename = attrs["file"]
+			self.do_parse(os.path.join(self.path, filename))
+		elif name == "domain":
+			self.current_domain = attrs["name"]
+			if "prefix" in attrs:
+				self.current_prefix = self.parse_variants(attrs)
+				self.current_prefix_type = attrs["prefix"]
+			else:
+				self.current_prefix = None
+				self.current_prefix_type = None
+			if "varset" in attrs:
+				self.current_varset = self.enums[attrs["varset"]]
+		elif name == "stripe":
+			self.current_stripe = self.parse_variants(attrs)
+		elif name == "enum":
+			self.current_enum_value = 0
+			self.current_enum = Enum(attrs["name"])
+			self.enums[attrs["name"]] = self.current_enum
+			# Only elements of the top-level file are dumped; the stack holds
+			# one entry per file currently being parsed.
+			if len(self.stack) == 1:
+				self.file.append(self.current_enum)
+		elif name == "value":
+			# NOTE(review): current_enum_value is never advanced in this
+			# class, so every <value> without a value attribute gets 0 -
+			# confirm that this is intended.
+			if "value" in attrs:
+				value = int(attrs["value"], 0)
+			else:
+				value = self.current_enum_value
+			self.current_enum.values.append((attrs["name"], value))
+		elif name == "reg32":
+			self.parse_reg(attrs, 32)
+		elif name == "reg64":
+			self.parse_reg(attrs, 64)
+		elif name == "array":
+			self.current_bitsize = 32
+			variant = self.parse_variants(attrs)
+			self.current_array = Array(attrs, self.prefix(variant), variant)
+			if len(self.stack) == 1:
+				self.file.append(self.current_array)
+		elif name == "bitset":
+			self.current_bitset = Bitset(attrs["name"], None)
+			if "inline" in attrs and attrs["inline"] == "yes":
+				self.current_bitset.inline = True
+			self.bitsets[self.current_bitset.name] = self.current_bitset
+			# Inline bitsets are dumped with the registers that use them.
+			if len(self.stack) == 1 and not self.current_bitset.inline:
+				self.file.append(self.current_bitset)
+		elif name == "bitfield" and self.current_bitset:
+			self.parse_field(attrs["name"], attrs)
+		elif name == "database":
+			self.do_validate(attrs["xsi:schemaLocation"])
+		elif name == "copyright":
+			self.copyright_year = attrs["year"]
+		elif name == "author":
+			# NOTE(review): the name is appended a second time after the
+			# e-mail address - confirm that this is intended.
+			self.authors.append(attrs["name"] + " <" + attrs["email"] + "> " + attrs["name"])
+
+	def end_element(self, name):
+		if name == "domain":
+			self.current_domain = None
+			self.current_prefix = None
+			self.current_prefix_type = None
+		elif name == "stripe":
+			self.current_stripe = None
+		elif name == "bitset":
+			self.current_bitset = None
+		elif name == "reg32":
+			self.current_reg = None
+		elif name == "array":
+			self.current_array = None
+		elif name == "enum":
+			self.current_enum = None
+		elif name == "license":
+			self.license = self.cdata
+
+	def character_data(self, data):
+		self.cdata += data
+
+	def dump_reg_usages(self):
+		d = collections.defaultdict(list)
+		for usage, regs in self.usage_regs.items():
+			for reg in regs:
+				variants = self.variant_regs.get(reg.name)
+				if variants:
+					for variant, vreg in variants.items():
+						if reg == vreg:
+							d[(usage, variant)].append(reg)
+				else:
+					for variant in self.variants:
+						d[(usage, variant)].append(reg)
+
+		print("#ifdef __cplusplus")
+
+		for usage, regs in self.usage_regs.items():
+			print("template<chip CHIP> constexpr inline uint16_t %s_REGS[] = {};" % (usage.upper()))
+
+		for (usage, variant), regs in d.items():
+			offsets = []
+
+			for reg in regs:
+				if reg.array:
+					for i in range(reg.array.length):
+						offsets.append(reg.array.offset + reg.offset + i * reg.array.stride)
+						if reg.bit_size == 64:
+							offsets.append(offsets[-1] + 1)
+				else:
+					offsets.append(reg.offset)
+					if reg.bit_size == 64:
+						offsets.append(offsets[-1] + 1)
+
+			offsets.sort()
+
+			print("template<> constexpr inline uint16_t %s_REGS<%s>[] = {" % (usage.upper(), variant))
+			for offset in offsets:
+				print("\t%s," % hex(offset))
+			print("};")
+
+		print("#endif")
+
+	def dump(self):
+		enums = []
+		bitsets = []
+		regs = []
+		for e in self.file:
+			if isinstance(e, Enum):
+				enums.append(e)
+			elif isinstance(e, Bitset):
+				bitsets.append(e)
+			else:
+				regs.append(e)
+
+		for e in enums + bitsets + regs:
+			e.dump()
+
+		self.dump_reg_usages()
+
+
+	def dump_regs_py(self):
+		regs = []
+		for e in self.file:
+			if isinstance(e, Reg):
+				regs.append(e)
+
+		for e in regs:
+			e.dump_py()
+
+
+	def dump_reg_variants(self, regname, variants):
+		"""Print the C++ struct, templated pack function and macro for a
+		register that exists in several variants.
+
+		regname  -- name of the register
+		variants -- dict mapping variant name to the Reg of that variant
+		"""
+		# Don't bother for things that only have a single variant:
+		if len(variants) == 1:
+			return
+		print("#ifdef __cplusplus")
+		print("struct __%s {" % regname)
+		# TODO be more clever.. we should probably figure out which
+		# fields have the same type in all variants (in which they
+		# appear) and stuff everything else in a variant specific
+		# sub-structure.
+		# A field name that appears in several variants is declared once,
+		# with the type it has in the first variant that defines it.
+		seen_fields = []
+		bit_size = 32
+		array = False
+		address = None
+		for variant in variants.keys():
+			print("    /* %s fields: */" % variant)
+			reg = variants[variant]
+			# bit_size and array end up with the values of the last variant.
+			bit_size = reg.bit_size
+			array = reg.array
+			for f in reg.bitset.fields:
+				fld_name = field_name(reg, f)
+				if fld_name in seen_fields:
+					continue
+				seen_fields.append(fld_name)
+				name = fld_name.lower()
+				if f.type in [ "address", "waddress" ]:
+					# Only the first address field gets the bo/bo_offset pair.
+					if address:
+						continue
+					address = f
+					tab_to("    __bo_type", "bo;")
+					tab_to("    uint32_t", "bo_offset;")
+					continue
+				type, val = f.ctype("var")
+				tab_to("    %s" %type, "%s;" %name)
+		print("    /* fallback fields: */")
+		if bit_size == 64:
+			tab_to("    uint64_t", "unknown;")
+			tab_to("    uint64_t", "qword;")
+		else:
+			tab_to("    uint32_t", "unknown;")
+			tab_to("    uint32_t", "dword;")
+		print("};")
+		# TODO don't hardcode the varset enum name
+		varenum = "chip"
+		print("template <%s %s>" % (varenum, varenum.upper()))
+		print("static inline struct fd_reg_pair")
+		# Registers inside an array take the array index as first argument.
+		xtra = ""
+		xtravar = ""
+		if array:
+			xtra = "int __i, "
+			xtravar = "__i, "
+		print("__%s(%sstruct __%s fields) {" % (regname, xtra, regname))
+		# One branch per variant, each holding that variant's pack code.
+		for variant in variants.keys():
+			print("  if (%s == %s) {" % (varenum.upper(), variant))
+			reg = variants[variant]
+			reg.dump_regpair_builder()
+			print("  } else")
+		print("    assert(!\"invalid variant\");")
+		print("}")
+
+		# For 64 bit registers the macro expands to a second, empty entry.
+		if bit_size == 64:
+			skip = ", { .reg = 0 }"
+		else:
+			skip = ""
+
+		print("#define %s(VARIANT, %s...) __%s<VARIANT>(%s{__VA_ARGS__})%s" % (regname, xtravar, regname, xtravar, skip))
+		print("#endif /* __cplusplus */")
+
+	def dump_structs(self):
+		for e in self.file:
+			e.dump_pack_struct()
+
+		for regname in self.variant_regs:
+			self.dump_reg_variants(regname, self.variant_regs[regname])
--- a/src/gallium/drivers/rocket/intercept.c
+++ b/src/gallium/drivers/rocket/intercept.c
@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "drm-uapi/rknpu_ioctl.h"
+#include "rkt_registers.h"
+
+// #define GETENV 1
+
+/* One buffer object observed through an intercepted MEM_CREATE ioctl. */
+struct bo {
+   int handle;        /* handle copied from rknpu_mem_create, used for MEM_MAP */
+   unsigned size;     /* size copied from rknpu_mem_create */
+   uint64_t obj_addr; /* obj_addr as reported by the driver; matched against
+                       * rknpu_submit.task_obj_addr */
+   uint64_t dma_addr; /* DMA address; matched against addresses found in the
+                       * register command stream */
+};
+
+#define MAX_BOS 3000
+
+/* Global state of the interposer. */
+struct context {
+   int dump_file;           /* not referenced in the visible part of this file */
+   int device_fd;           /* fd of the last successful-looking MEM_CREATE call,
+                             * reused for MEM_MAP and mmap() */
+   struct bo bos[MAX_BOS];  /* every BO seen so far, in creation order */
+   unsigned next_handle_id; /* number of valid entries in bos[] */
+};
+
+struct context context = {0};
+
+/* Append a printf-style formatted message to "rknpu.log" in the current
+ * working directory. The file is opened and closed on every call; the message
+ * is silently dropped if the file cannot be opened. */
+static void
+dump_log(const char *format, ...)
+{
+   int dump_fd = open("rknpu.log", O_CREAT | O_RDWR | O_APPEND,
+                      S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+   if (dump_fd < 0)
+      return;
+
+   va_list args;
+   va_start(args, format);
+   vdprintf(dump_fd, format, args);
+   va_end(args);
+
+   close(dump_fd);
+}
+
+#define L(...) dump_log(__VA_ARGS__);
+
+/* Map a tracked BO into this process.
+ *
+ * Asks the kernel for the mmap offset of bo->handle and maps bo->size bytes
+ * read/write. Returns the mapping, or MAP_FAILED if either the MEM_MAP ioctl
+ * or mmap() fails. The caller owns the mapping and must munmap() it. */
+static void *
+map_bo(struct bo *bo)
+{
+   struct rknpu_mem_map req = {0};
+
+   req.handle = bo->handle;
+   /* Without a valid offset mmap() would map something unrelated (offset 0). */
+   if (ioctl(context.device_fd, DRM_IOCTL_RKNPU_MEM_MAP, &req) != 0)
+      return MAP_FAILED;
+
+   return mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+               context.device_fd, req.offset);
+}
+
+/* Find the tracked BO whose DMA range contains dma_address.
+ *
+ * On success returns the BO and stores the byte offset of dma_address within
+ * it in *offset. Returns NULL (leaving *offset untouched) when no BO matches.
+ * Every comparison is traced to stderr. */
+static struct bo *
+find_bo(uint64_t dma_address, unsigned *offset)
+{
+   for (unsigned j = 0; j < context.next_handle_id; j++) {
+      struct bo *bo = &context.bos[j];
+
+      /* Cast explicitly: uint64_t is not "unsigned long" on every ABI. */
+      fprintf(stderr, "needle %llx hay %llx i %u\n",
+              (unsigned long long)dma_address,
+              (unsigned long long)bo->dma_addr, j);
+      if (dma_address >= bo->dma_addr &&
+          dma_address < bo->dma_addr + bo->size) {
+         *offset = dma_address - bo->dma_addr;
+         return bo;
+      }
+   }
+
+   return NULL;
+}
+
+/* Write the contents of device memory starting at dma_address to file `name`.
+ *
+ * size == 0, or a size reaching past the end of the containing BO, means
+ * "up to the end of the BO". Nothing is written when the address does not
+ * belong to a tracked BO or when the BO cannot be mapped. */
+static void
+dump_buffer(const char *name, uint64_t dma_address, unsigned size)
+{
+   unsigned offset = 0;
+   struct bo *bo = find_bo(dma_address, &offset);
+
+   fprintf(stderr, "dump_buffer name %s dma 0x%llx size %u bo %p\n", name,
+           (unsigned long long)dma_address, size, (void *)bo);
+
+   if (bo == NULL)
+      return;
+
+   /* find_bo() guarantees offset < bo->size, so the subtraction cannot wrap
+    * (unlike size + offset). */
+   if (size == 0 || size > bo->size - offset)
+      size = bo->size - offset;
+
+   uint8_t *map = map_bo(bo);
+   if (map == MAP_FAILED) {
+      fprintf(stderr, "dump_buffer: failed to map BO for %s\n", name);
+      return;
+   }
+
+   int fd = open(name, O_CREAT | O_RDWR | O_TRUNC,
+                 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+   if (fd < 0) {
+      fprintf(stderr, "dump_buffer: failed to open %s\n", name);
+   } else {
+      if (write(fd, map + offset, size) != (ssize_t)size)
+         fprintf(stderr, "dump_buffer: short write to %s\n", name);
+      close(fd);
+   }
+
+   /* map_bo() creates a fresh mapping on every call; release it. */
+   munmap(map, bo->size);
+}
+
+static unsigned task_id = 0;
+
+/* Log a submit request and dump the buffers its tasks reference.
+ *
+ * The request is written to the log as a C initializer. Each task in the
+ * submitted range is logged, its register command stream is dumped to
+ * regcmd<N>.bin and scanned for the input, weights, bias, element-wise and
+ * output addresses; all but the output buffer are dumped right away. The
+ * output address of the last scanned task is returned through
+ * *output_address so that the caller can dump it after the job has run.
+ *
+ * Returns 0 on success, -1 if the task BO is unknown or cannot be mapped. */
+static int
+handle_submit(struct rknpu_submit *args, uint32_t *output_address)
+{
+   L("struct rknpu_submit submit = {\n");
+   L("   .flags = %x,\n", args->flags);
+   L("   .timeout = %d,\n", args->timeout);
+   L("   .task_start = %d,\n", args->task_start);
+   L("   .task_number = %d,\n", args->task_number);
+   L("   .task_counter = %d,\n", args->task_counter);
+   L("   .priority = %d,\n", args->priority);
+   L("   .task_obj_addr = 0x%llx,\n", args->task_obj_addr);
+   L("   .regcfg_obj_addr = 0x%llx,\n", args->regcfg_obj_addr);
+   L("   .task_base_addr = 0x%llx,\n", args->task_base_addr);
+   L("   .user_data = 0x%llx,\n", args->user_data);
+   L("   .core_mask = %x,\n", args->core_mask);
+   L("   .fence_fd = %d,\n", args->fence_fd);
+   L("   .subcore_task = {\n");
+   L("      {\n");
+   L("         .task_start = %d,\n", args->subcore_task[0].task_start);
+   L("         .task_number = %d,\n", args->subcore_task[0].task_number);
+   L("      },\n");
+   L("      {\n");
+   L("         .task_start = %d,\n", args->subcore_task[1].task_start);
+   L("         .task_number = %d,\n", args->subcore_task[1].task_number);
+   L("      },\n");
+   L("      {\n");
+   L("         .task_start = %d,\n", args->subcore_task[2].task_start);
+   L("         .task_number = %d,\n", args->subcore_task[2].task_number);
+   L("      },\n");
+   L("   },\n");
+   L("};\n");
+
+   /* The task array lives in the BO whose obj_addr the submit refers to. */
+   struct bo *task_bo = NULL;
+   for (unsigned i = 0; i < context.next_handle_id; i++) {
+      if (context.bos[i].obj_addr == args->task_obj_addr) {
+         task_bo = &context.bos[i];
+         break;
+      }
+   }
+
+   if (task_bo == NULL) {
+      fprintf(stderr, "%s: no BO with obj_addr 0x%llx\n", __func__,
+              (unsigned long long)args->task_obj_addr);
+      return -1;
+   }
+
+   struct rknpu_task *tasks = map_bo(task_bo);
+   if (tasks == MAP_FAILED) {
+      fprintf(stderr, "%s: failed to map the task BO\n", __func__);
+      return -1;
+   }
+
+   /* NOTE(review): only task_number / 3 tasks are walked; presumably the
+    * tasks are spread over three cores -- confirm. */
+   for (int i = args->task_start; i < args->task_start + args->task_number / 3;
+        i++) {
+      L("tasks[%d].flags = 0x%x;\n", i, tasks[i].flags);
+      L("tasks[%d].op_idx = %d;\n", i, tasks[i].op_idx);
+      L("tasks[%d].enable_mask = 0x%x;\n", i, tasks[i].enable_mask);
+      L("tasks[%d].int_mask = 0x%x;\n", i, tasks[i].int_mask);
+      L("tasks[%d].int_clear = 0x%x;\n", i, tasks[i].int_clear);
+      L("tasks[%d].regcfg_amount = %d;\n", i, tasks[i].regcfg_amount);
+      L("tasks[%d].regcfg_offset = 0x%x;\n", i, tasks[i].regcfg_offset);
+      L("tasks[%d].regcmd_addr = 0x%llx;\n", i, tasks[i].regcmd_addr);
+
+      if (tasks[i].regcmd_addr == 0x0)
+         continue;
+
+      char name[PATH_MAX];
+      unsigned regcmd_count =
+         tasks[i].regcfg_amount + RKNPU_PC_DATA_EXTRA_AMOUNT;
+      unsigned size = regcmd_count * sizeof(uint64_t);
+      uint64_t regcmd_addr = tasks[i].regcmd_addr + tasks[i].regcfg_offset;
+
+      snprintf(name, sizeof(name), "regcmd%d.bin", task_id);
+      dump_buffer(name, regcmd_addr, size);
+
+      uint32_t input_address = 0x0;
+      *output_address = 0x0;
+      uint32_t weights_address = 0x0;
+      uint32_t biases_address = 0x0;
+      uint32_t eltwise_address = 0x0;
+
+      unsigned offset = 0;
+      struct bo *bo = find_bo(regcmd_addr, &offset);
+      uint8_t *regcmd_map = bo ? map_bo(bo) : MAP_FAILED;
+      if (regcmd_map == MAP_FAILED) {
+         fprintf(stderr, "%s: cannot read register commands of task %d\n",
+                 __func__, i);
+         continue;
+      }
+
+      /* Never read past the end of the BO holding the commands. */
+      unsigned available = (bo->size - offset) / sizeof(uint64_t);
+      if (regcmd_count > available)
+         regcmd_count = available;
+
+      /* Each command is a 64-bit word: the low 16 bits select the register,
+       * the following 32 bits carry the value written to it. */
+      const uint64_t *regcmd = (const uint64_t *)(regcmd_map + offset);
+      for (unsigned j = 0; j < regcmd_count; j++) {
+         uint32_t value = (regcmd[j] & 0xffffffff0000) >> 16;
+
+         switch (regcmd[j] & 0xffff) {
+         case REG_CNA_FEATURE_DATA_ADDR:
+            input_address = value;
+            break;
+         case REG_CNA_DCOMP_ADDR0:
+            weights_address = value;
+            break;
+         case REG_DPU_DST_BASE_ADDR:
+            /* Only the first destination address of a task is kept. */
+            if (*output_address == 0x0)
+               *output_address = value;
+            break;
+         case REG_DPU_RDMA_RDMA_BS_BASE_ADDR:
+            biases_address = value;
+            break;
+         case REG_DPU_RDMA_RDMA_EW_BASE_ADDR:
+            eltwise_address = value;
+            break;
+         default:
+            break;
+         }
+      }
+
+      munmap(regcmd_map, bo->size);
+
+      fprintf(stderr, "weights_address %x\n", weights_address);
+      fprintf(stderr, "input_address %x\n", input_address);
+      fprintf(stderr, "output_address %x\n", *output_address);
+      fprintf(stderr, "biases_address %x\n", biases_address);
+      fprintf(stderr, "eltwise_address %x\n", eltwise_address);
+
+      if (weights_address != 0x0) {
+         snprintf(name, sizeof(name), "weights%d.bin", task_id);
+         dump_buffer(name, weights_address, 0);
+      }
+
+      if (biases_address != 0x0) {
+         snprintf(name, sizeof(name), "biases%d.bin", task_id);
+         dump_buffer(name, biases_address, 0);
+      }
+
+      if (eltwise_address != 0x0) {
+         snprintf(name, sizeof(name), "eltwise%d.bin", task_id);
+         dump_buffer(name, eltwise_address, 0);
+      }
+
+      if (input_address != 0x0) {
+         snprintf(name, sizeof(name), "input%d.bin", task_id);
+         dump_buffer(name, input_address, 0);
+      }
+
+      task_id++;
+   }
+
+   munmap(tasks, task_bo->size);
+
+   return 0;
+}
+
+/* Write a MEM_SYNC request to the log, formatted as a C initializer.
+ * The only call site visible in this file is commented out. */
+static void
+handle_mem_sync(struct rknpu_mem_sync *args)
+{
+   L("struct rknpu_mem_sync sync = {\n");
+   L("   .flags = 0x%x,\n", args->flags);
+   L("   .reserved = 0x%x,\n", args->reserved);
+   L("   .obj_addr = 0x%llx,\n", args->obj_addr);
+   L("   .offset = 0x%llx,\n", args->offset);
+   L("   .size = %llu,\n", args->size);
+   L("};\n");
+}
+
+/* Record a BO created through MEM_CREATE so that later submits can be
+ * resolved back to it.
+ *
+ * Must be called after the real ioctl has filled in `args`. Returns 0 on
+ * success, or -1 (recording nothing) when the BO table is full. */
+static int
+handle_mem_create(struct rknpu_mem_create *args)
+{
+#if 0
+   L("struct rknpu_mem_create create = {\n");
+   L("   .dma_addr = 0x%llx,\n", args->dma_addr);
+   L("   .flags = 0x%x,\n", args->flags);
+   L("   .handle = %u,\n", args->handle);
+   L("   .obj_addr = 0x%llx,\n", args->obj_addr);
+   L("   .size = %llu,\n", args->size);
+   L("};\n");
+#endif
+
+   /* Checked at run time: an assert would vanish with NDEBUG and the stores
+    * below would then run past the end of the table. */
+   if (context.next_handle_id >= MAX_BOS) {
+      fprintf(stderr, "%s: BO table full (%d entries), BO not tracked\n",
+              __func__, MAX_BOS);
+      return -1;
+   }
+
+   struct bo *bo = &context.bos[context.next_handle_id];
+   bo->handle = args->handle;
+   bo->size = args->size;
+   bo->obj_addr = args->obj_addr;
+   bo->dma_addr = args->dma_addr;
+
+   fprintf(stderr, "%s: dma_addr %llx\n", __func__, args->dma_addr);
+   context.next_handle_id++;
+
+   return 0;
+}
+
+/* Write an ACTION request to the log, naming the action selected by
+ * args->flags together with args->value. Unknown actions are logged with
+ * their numeric code. The only call site visible in this file is commented
+ * out. */
+static void
+handle_action(struct rknpu_action *args)
+{
+   switch (args->flags) {
+   case RKNPU_GET_HW_VERSION:
+      L("%s: RKNPU_GET_HW_VERSION %x\n", __func__, args->value);
+      break;
+   case RKNPU_GET_DRV_VERSION:
+      L("%s: RKNPU_GET_DRV_VERSION %x\n", __func__, args->value);
+      break;
+   case RKNPU_POWER_ON:
+      L("%s: RKNPU_POWER_ON %x\n", __func__, args->value);
+      break;
+   case RKNPU_GET_IOMMU_EN:
+      L("%s: RKNPU_GET_IOMMU_EN %x\n", __func__, args->value);
+      break;
+   case RKNPU_SET_PROC_NICE:
+      L("%s: RKNPU_SET_PROC_NICE %x\n", __func__, args->value);
+      break;
+   case RKNPU_GET_FREQ:
+      L("%s: RKNPU_GET_FREQ %x\n", __func__, args->value);
+      break;
+   default:
+      L("%s: unhandled action %d %x\n", __func__, args->flags, args->value);
+      break;
+   }
+}
+
+typedef int (*real_ioctl_t)(int fd, unsigned long request, ...);
+
+/* Interposed ioctl(): forwards every request to the next ioctl() in link
+ * order and, for RKNPU requests, logs the job and dumps its buffers.
+ *
+ * Submits are inspected before they are forwarded (inputs) and after they
+ * return (output). BO creations are recorded after they return, and only
+ * when the real call succeeded, since the kernel fills in the fields that
+ * get recorded. Returns whatever the real ioctl() returned. */
+int
+ioctl(int fd, unsigned long request, ...)
+{
+   static real_ioctl_t real_ioctl;
+   int ret;
+   uint32_t output_address = 0;
+
+   va_list ap;
+   va_start(ap, request);
+   void *ptr_ = va_arg(ap, void *);
+   va_end(ap);
+
+   /* Resolve the real implementation once instead of on every call. */
+   if (real_ioctl == NULL) {
+      real_ioctl = (real_ioctl_t)dlsym(RTLD_NEXT, "ioctl");
+      if (real_ioctl == NULL) {
+         fprintf(stderr, "intercept: cannot resolve ioctl\n");
+         return -1;
+      }
+   }
+
+   switch (request) {
+   case DRM_IOCTL_RKNPU_SUBMIT:
+      handle_submit(ptr_, &output_address);
+      break;
+   case DRM_IOCTL_RKNPU_MEM_SYNC:
+      // handle_mem_sync(ptr_);
+      break;
+   case DRM_IOCTL_RKNPU_ACTION:
+      // handle_action(ptr_);
+      break;
+   default:
+      break;
+   }
+
+   ret = real_ioctl(fd, request, ptr_);
+
+   switch (request) {
+   case DRM_IOCTL_RKNPU_SUBMIT:
+      /* No output address means there is nothing to look up and dump.
+       * NOTE(review): task_id was already advanced by handle_submit(), so
+       * the output file carries the next task's number -- confirm intended. */
+      if (output_address != 0x0) {
+         char name[PATH_MAX];
+         snprintf(name, sizeof(name), "output%d.bin", task_id);
+         dump_buffer(name, output_address, 0);
+      }
+      break;
+   case DRM_IOCTL_RKNPU_MEM_CREATE:
+   case IOCTL_RKNPU_MEM_CREATE:
+   case 0xc0286442: /* NOTE(review): presumably another encoding of
+                     * MEM_CREATE -- confirm */
+      if (ret >= 0) {
+         handle_mem_create(ptr_);
+         context.device_fd = fd;
+      }
+      break;
+   default:
+      break;
+   }
+
+   return ret;
+}
+
+/* Intended to be called from GDB when the underlying memory is not directly
+ * accessible to it.
+ *
+ * Prints bytes / 4 32-bit words starting at ptr to stderr, four words per
+ * line. A trailing partial word is not printed. */
+void dump_mem(uint32_t *ptr, unsigned bytes);
+
+void
+dump_mem(uint32_t *ptr, unsigned bytes)
+{
+   unsigned words = bytes / 4;
+
+   /* Index by word: stepping a whole row per iteration while counting words
+    * would read four times the requested range. */
+   for (unsigned i = 0; i < words; i++) {
+      int end_of_row = (i % 4 == 3) || (i + 1 == words);
+      fprintf(stderr, "%08x%s", ptr[i], end_of_row ? "\n" : " ");
+   }
+}
+
+#ifdef GETENV
+typedef char *(*real_getenv_t)(const char *name);
+
+/* Interposed getenv(): traces the queried variable name on stderr and then
+ * defers to the next getenv() in link order. */
+char *
+getenv(const char *name)
+{
+   real_getenv_t next_getenv = (real_getenv_t)dlsym(RTLD_NEXT, "getenv");
+
+   fprintf(stderr, "getenv %s\n", name);
+
+   return next_getenv(name);
+}
+
+#endif
--- a/src/gallium/drivers/rocket/meson.build
+++ b/src/gallium/drivers/rocket/meson.build
@ -0,0 +1,38 @@
+# Copyright 2019 Google, Inc
+# SPDX-License-Identifier: MIT
+
+# Generate rkt_registers.h from registers.xml; the generator prints the header
+# on stdout, which meson captures into the output file.
+rocket_registers = custom_target(
+  'rkt_registers.h',
+  input : ['gen_header.py', 'registers.xml'],
+  output : 'rkt_registers.h',
+  command : [prog_python, '@INPUT0@', '--rnn', '.', '--xml', '@INPUT1@', 'c-defines'],
+  capture : true,
+)
+
+# Sources of the Gallium driver proper.
+files_rocket = files(
+  'rkt_coefs.c',
+  'rkt_device.c',
+  'rkt_ml.c',
+  'rkt_regcmd.c',
+  'rkt_task.c',
+)
+
+librocket = static_library(
+  'rocket',
+  [files_rocket, rocket_registers],
+  include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src],
+  gnu_symbol_visibility : 'hidden',
+  dependencies : [idep_mesautil, dep_libdrm],
+)
+
+# What Gallium targets link against to pull the driver in.
+driver_rocket = declare_dependency(
+  compile_args : '-DGALLIUM_ROCKET',
+  link_with : [librocketwinsys, librocket]
+)
+
+# Stand-alone debugging aid built from intercept.c, which defines its own
+# ioctl(); it is not linked into the driver. Built unoptimized with debug info.
+shared_library('intercept',
+   [files('intercept.c'), rocket_registers],
+   include_directories : [inc_include],
+   dependencies : [dep_libdrm],
+   c_args: ['-Wno-error=missing-prototypes', '-g', '-O0']
+)
--- a/src/gallium/drivers/rocket/registers.xml
+++ b/src/gallium/drivers/rocket/registers.xml
--- a/src/gallium/drivers/rocket/rkt_coefs.c
+++ b/src/gallium/drivers/rocket/rkt_coefs.c
@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/u_inlines.h"
+
+#include "rkt_coefs.h"
+#include "rkt_ml.h"
+
+/* Repack the weights of a convolution into a newly created buffer.
+ *
+ * The weight tensor of poperation is read through a 4-D view indexed as
+ * [output channel][x][y][input channel] and written out in the nested order
+ * of the loops below, with every value shifted by -0x80. Positions beyond the
+ * real channel counts are filled with padding values.
+ *
+ * Returns the new buffer; the mappings taken here are released before
+ * returning.
+ */
+struct pipe_resource *
+rkt_fill_weights(struct rkt_ml_subgraph *subgraph,
+                 const struct pipe_ml_operation *poperation)
+{
+   struct pipe_context *pcontext = subgraph->base.context;
+   unsigned weights_width = poperation->conv.weight_tensor->dims[1];
+   unsigned weights_height = poperation->conv.weight_tensor->dims[2];
+   unsigned input_channels = poperation->input_tensors[0]->dims[3];
+   unsigned input_channels_real = poperation->input_tensors[0]->dims[3];
+   unsigned output_channels = poperation->output_tensors[0]->dims[3];
+   unsigned output_channels_real = poperation->output_tensors[0]->dims[3];
+   unsigned weights_size;
+   uint8_t zero_point = poperation->conv.weight_tensor->zero_point;
+   struct pipe_transfer *transfer_in, *transfer_out;
+   void *map =
+      pipe_buffer_map(pcontext, poperation->conv.weight_tensor->resource,
+                      PIPE_MAP_READ, &transfer_in);
+   /* The view of the source must be declared here, while input_channels still
+    * holds the real channel count: the padding applied below must not change
+    * how the source tensor is indexed. */
+   uint8_t(*weights_in)[weights_width][weights_height][input_channels] = map;
+   struct pipe_resource *rsc;
+   uint8_t *weights_out;
+
+   /* From here on the *_channels variables hold padded counts; the *_real
+    * ones keep the tensor's actual dimensions. */
+   input_channels = MAX2(input_channels, FEATURE_ATOMIC_SIZE);
+
+   output_channels = ALIGN(output_channels, 2);
+   if (rkt_is_depthwise(poperation))
+      output_channels = 1;
+
+   weights_size = weights_width * weights_height * output_channels *
+                  ALIGN(input_channels, WEIGHT_ATOMIC_SIZE) * 2;
+
+   rsc =
+      pipe_buffer_create(pcontext->screen, 0, PIPE_USAGE_DEFAULT, weights_size);
+   weights_out = pipe_buffer_map(pcontext, rsc, PIPE_MAP_WRITE, &transfer_out);
+
+   /* Input channels are emitted in groups; depthwise operations use groups
+    * twice as large. */
+   unsigned input_channel_groups = WEIGHT_ATOMIC_SIZE;
+   if (rkt_is_depthwise(poperation))
+      input_channel_groups *= 2;
+
+   unsigned input_channels_1 =
+      DIV_ROUND_UP(input_channels, input_channel_groups);
+   unsigned input_channels_2 = MIN2(input_channels, input_channel_groups);
+
+   /* n is the write cursor into weights_out. */
+   unsigned n = 0;
+   for (int oc1 = 0; oc1 < DIV_ROUND_UP(output_channels, WEIGHT_ATOMIC_SIZE);
+        oc1++) {
+      for (int ic1 = 0; ic1 < input_channels_1; ic1++) {
+         for (int x = 0; x < weights_width; x++) {
+            for (int y = 0; y < weights_height; y++) {
+               for (int oc2 = 0; oc2 < MIN2(output_channels, WEIGHT_ATOMIC_SIZE);
+                    oc2++) {
+                  for (int ic2 = 0; ic2 < input_channels_2; ic2++) {
+                     unsigned oc = oc1 * WEIGHT_ATOMIC_SIZE + oc2;
+                     unsigned ic = ic1 * input_channel_groups + ic2;
+                     /* Nothing is emitted for output channels past the
+                      * 2-aligned real count (when there are more than two). */
+                     if (output_channels_real > 2 &&
+                         oc >= ALIGN(output_channels_real, 2))
+                        continue;
+
+                     /* Padding output channel: zeros. */
+                     if (oc >= output_channels_real)
+                        weights_out[n++] = 0x0;
+                     /* Padding input channel: the shifted zero point, but
+                      * only for some positions; the others emit nothing. */
+                     else if (ic >= input_channels_real) {
+                        if (ic2 < 16 || (input_channels_real % 32) > 16)
+                           weights_out[n++] =
+                              zero_point - 0x80; /* TODO: Why is the blob converting to
+                                                    signed? It should be unsigned. */
+                     } else
+                        weights_out[n++] = weights_in[oc][x][y][ic] -
+                                           0x80; /* TODO: Why is the blob converting to
+                                                    signed? It should be unsigned. */
+                  }
+               }
+            }
+         }
+      }
+   }
+
+   /* Optional dump for offline analysis; the counter numbers the dumps in
+    * call order. */
+   if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS)) {
+      static int task = 0;
+      rkt_dump_buffer(weights_out, "weights", 0, task++, 0, weights_size);
+   }
+
+   pipe_buffer_unmap(pcontext, transfer_out);
+
+   pipe_buffer_unmap(pcontext, transfer_in);
+
+   return rsc;
+}
+
+/* Compute the amount to subtract from the bias of output channel `oc`:
+ * the sum over the filter of (weight - weight zero point) multiplied by
+ * (input zero point - 0x80).
+ *
+ * `map` is the mapped weight tensor, viewed as
+ * [output channel][x][y][input channel]. For depthwise operations the filter
+ * of channel `oc` is read from [0][x][y][oc]; otherwise all input channels of
+ * [oc] are summed. */
+static int32_t
+calculate_bias_correction(struct rkt_ml_subgraph *subgraph,
+                          const struct pipe_ml_operation *poperation,
+                          unsigned oc, void *map)
+{
+   unsigned input_channels = poperation->input_tensors[0]->dims[3];
+   unsigned input_zero_point = poperation->input_tensors[0]->zero_point;
+   unsigned weights_width = poperation->conv.weight_tensor->dims[1];
+   unsigned weights_height = poperation->conv.weight_tensor->dims[2];
+   unsigned weight_zero_point = poperation->conv.weight_tensor->zero_point;
+   uint8_t(*weights)[weights_width][weights_height][input_channels] = map;
+
+   /* Deliberately unsigned, like every term of the sum: the arithmetic wraps
+    * and only the final accumulation is reinterpreted as signed. */
+   unsigned input_offset = input_zero_point - 0x80;
+
+   /* Both layouts are walked by the same loop nest; only the filter index and
+    * the range of input channels differ. */
+   bool depthwise = rkt_is_depthwise(poperation);
+   unsigned filter = depthwise ? 0 : oc;
+   unsigned ic_begin = depthwise ? oc : 0;
+   unsigned ic_end = depthwise ? oc + 1 : input_channels;
+
+   int32_t correction = 0;
+   for (unsigned x = 0; x < weights_width; x++) {
+      for (unsigned y = 0; y < weights_height; y++) {
+         for (unsigned ic = ic_begin; ic < ic_end; ic++) {
+            correction +=
+               (weights[filter][x][y][ic] - weight_zero_point) * input_offset;
+         }
+      }
+   }
+
+   return correction;
+}
+
+/* Build the bias buffer of a convolution.
+ *
+ * Each output channel gets its bias minus the zero-point correction computed
+ * from the weights, divided by 2^(*truncate_bits). *truncate_bits is chosen
+ * here and returned to the caller. Returns the newly created buffer; all
+ * mappings taken here are released before returning.
+ */
+struct pipe_resource *
+rkt_fill_biases(struct rkt_ml_subgraph *subgraph,
+                const struct pipe_ml_operation *poperation,
+                unsigned *truncate_bits)
+{
+   /* Bit patterns of the weight scales for which one bit is truncated. */
+   static const uint32_t truncating_scales[] = {
+      0x3a88323f, 0x3c0060de, 0x3c06022d, 0x3c1642e3, 0x3c1e3f51,
+      0x3c5c8aa8, 0x3c615e93, 0x3c7326a2, 0x3c783013, 0x3d1748e6,
+      0x3d282992, 0x3d2e87ae, 0x3d77f5f6, 0x3a9a5956, 0x3caebc56,
+   };
+   struct pipe_context *pcontext = subgraph->base.context;
+   unsigned output_channels = poperation->output_tensors[0]->dims[3];
+   unsigned weights_size = poperation->conv.weight_tensor->dims[1];
+   struct pipe_transfer *transfer_in, *transfer_out, *transfer_weights;
+   int32_t *biases_in =
+      pipe_buffer_map(pcontext, poperation->conv.bias_tensor->resource,
+                      PIPE_MAP_READ, &transfer_in);
+   void *weights =
+      pipe_buffer_map(pcontext, poperation->conv.weight_tensor->resource,
+                      PIPE_MAP_READ, &transfer_weights);
+   struct pipe_resource *rsc;
+   uint32_t *biases;
+
+   rsc = pipe_buffer_create(pcontext->screen, 0, PIPE_USAGE_DEFAULT,
+                            output_channels * sizeof(uint32_t));
+   biases = pipe_buffer_map(pcontext, rsc, PIPE_MAP_WRITE, &transfer_out);
+
+   // DBG("weight_scale %x\n",
+   // fui(poperation->conv.weight_tensor->scale));
+   /* TODO: Figure out when exactly we need to truncate */
+   /* From
+    * http://nvdla.org/hw/v1/ias/unit_description.html#convolution-accumulator :
+    *
+    * The final result of accumulator in CACC is 48bits for INT16 and 34bits for
+    * INT8. The bit width between CACC and SDP is 32. For precisions INT8 and
+    * INT16, there is a round and saturation operation before sending the result
+    * to SDP. The precision of rounding is configured by field CLIP_TRUNCATE in
+    * register D_CLIP_CFG. For FP16, the value is just converted from FP48 to
+    * FP32.
+    */
+   const uint32_t scale_bits = fui(poperation->conv.weight_tensor->scale);
+   unsigned initial_truncate = 0;
+   for (unsigned i = 0;
+        i < sizeof(truncating_scales) / sizeof(truncating_scales[0]); i++) {
+      if (scale_bits == truncating_scales[i]) {
+         initial_truncate = 1;
+         break;
+      }
+   }
+   *truncate_bits = initial_truncate;
+
+   int32_t max_bias = 0;
+   int32_t max_corr = 0;
+   unsigned max_num_bits = 0;
+   bool retry = true;
+   /* NOTE(review): `retry` is overwritten for every channel, so only the last
+    * channel decides whether another pass runs, and *truncate_bits may change
+    * in the middle of a pass -- confirm this is intended. */
+   while (retry) {
+      for (int oc = 0; oc < output_channels; oc++) {
+         int32_t corr =
+            calculate_bias_correction(subgraph, poperation, oc, weights);
+         biases[oc] = (biases_in[oc] - corr) / (1 << *truncate_bits);
+
+         /* Estimate of the largest value the accumulator may reach. */
+         int64_t max_val =
+            (biases_in[oc] - corr + 255 * 255 * weights_size * weights_size) /
+            (1 << *truncate_bits);
+         unsigned num_bits = ceil(log(abs((int32_t)max_val)) / log(2)) + 1;
+         max_bias = MAX2(max_bias, biases[oc]);
+         max_corr = MAX2(max_corr, corr);
+         max_num_bits = MAX2(max_num_bits, num_bits);
+
+         /* TODO: This doesn't actually work, num_bits doesn't go above 19, and the
+          * blob sometimes truncates way below */
+         if (num_bits > 32) {
+            (*truncate_bits)++;
+            retry = true;
+         } else
+            retry = false;
+      }
+   }
+
+   if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS)) {
+      static int task = 0;
+      rkt_dump_buffer((uint8_t *)biases, "biases", 0, task++, 0,
+                      output_channels * sizeof(uint32_t));
+   }
+
+   pipe_buffer_unmap(pcontext, transfer_out);
+
+   pipe_buffer_unmap(pcontext, transfer_weights);
+
+   pipe_buffer_unmap(pcontext, transfer_in);
+
+   return rsc;
+}
--- a/src/gallium/drivers/rocket/rkt_coefs.h
+++ b/src/gallium/drivers/rocket/rkt_coefs.h
@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_COEFS_H
+#define RKT_COEFS_H
+
+#include "rkt_ml.h"
+
+/* Creates and returns a buffer holding the repacked weights of the
+ * convolution described by poperation. */
+struct pipe_resource *
+rkt_fill_weights(struct rkt_ml_subgraph *subgraph,
+                 const struct pipe_ml_operation *poperation);
+
+/* Creates and returns a buffer holding the corrected biases of the
+ * convolution described by poperation. The number of bits the biases were
+ * truncated by is stored in *truncate_bits. */
+struct pipe_resource *
+rkt_fill_biases(struct rkt_ml_subgraph *subgraph,
+                const struct pipe_ml_operation *poperation,
+                unsigned *truncate_bits);
+
+#endif /* RKT_COEFS_H */
--- a/src/gallium/drivers/rocket/rkt_device.c
+++ b/src/gallium/drivers/rocket/rkt_device.c
@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_device.h"
+#include "rkt_ml.h"
+
+#include "drm-uapi/rocket_accel.h"
+
+#include <xf86drm.h>
+#include "util/os_mman.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/u_transfer.h"
+
+/* Flags accepted in the ROCKET_DEBUG environment variable. */
+static const struct debug_named_value rocket_debug_options[] = {
+   {"dbg_msgs", ROCKET_DBG_MSGS, "Print debug messages"},
+   {"dump_bos", ROCKET_DBG_DUMP_BOS, "Dump buffers for analysis"},
+   {"zero_bos", ROCKET_DBG_ZERO, "Zero buffers for debugging"},
+   DEBUG_NAMED_VALUE_END};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(rocket_debug, "ROCKET_DEBUG", rocket_debug_options, 0)
+/* Active debug flags; assigned in rkt_screen_create() and tested through
+ * DBG_ENABLED(). */
+int rocket_debug = 0;
+
+/* pipe_screen::destroy: tear down the renderonly object, if any, and free
+ * the screen. */
+static void
+rkt_destroy_screen(struct pipe_screen *pscreen)
+{
+   struct rkt_screen *rkt = rkt_screen(pscreen);
+   struct renderonly *ro = rkt->ro;
+
+   if (ro != NULL)
+      ro->destroy(ro);
+
+   ralloc_free(rkt);
+}
+
+/* pipe_context::destroy: free the context allocated by rkt_create_context(). */
+static void
+rkt_destroy_context(struct pipe_context *pctx)
+{
+   ralloc_free(rkt_context(pctx));
+}
+
+/* Transfer handed out by rkt_buffer_map(). It wraps the generic transfer so
+ * that rkt_buffer_unmap() can release the CPU mapping again. */
+struct rkt_transfer {
+   struct pipe_transfer base; /* must stay first: cast to/from pipe_transfer */
+   void *map;                 /* start of the mmap()ed buffer */
+   size_t map_size;           /* length passed to mmap() */
+};
+
+/* pipe_context::buffer_map: map a buffer for CPU access.
+ *
+ * Issues PREP_BO on the BO with an unbounded timeout, then maps the whole
+ * buffer read/write. Returns a pointer to byte box->x of the buffer and
+ * stores the transfer in *out_transfer, or returns NULL on failure. Only
+ * 1-D buffer boxes are supported. */
+static void *
+rkt_buffer_map(struct pipe_context *pctx,
+               struct pipe_resource *prsc, unsigned level,
+               unsigned usage, const struct pipe_box *box,
+               struct pipe_transfer **out_transfer)
+{
+   struct rkt_screen *screen = rkt_screen(pctx->screen);
+   struct rkt_resource *rsc = rkt_resource(prsc);
+   struct drm_rocket_prep_bo arg = {0};
+   int ret;
+
+   assert(level == 0);
+   assert(prsc->target == PIPE_BUFFER);
+   assert(box->y == 0);
+   assert(box->z == 0);
+   assert(box->height == 1);
+   assert(box->depth == 1);
+
+   struct rkt_transfer *rtransfer = rzalloc(NULL, struct rkt_transfer);
+   if (!rtransfer)
+      return NULL;
+
+   struct pipe_transfer *transfer = &rtransfer->base;
+   transfer->level = level;
+   transfer->usage = usage;
+   transfer->box = *box;
+
+   pipe_resource_reference(&transfer->resource, prsc);
+
+   arg.handle = rsc->handle;
+   arg.timeout_ns = INT64_MAX;
+
+   /* Errors are handled at run time: asserts vanish in release builds and
+    * would let a bogus pointer escape to the caller. */
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_PREP_BO, &arg);
+   if (ret == -1)
+      goto fail;
+
+   uint8_t *map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED,
+                          screen->fd, rsc->fake_offset);
+   if (map == MAP_FAILED)
+      goto fail;
+
+   rtransfer->map = map;
+   rtransfer->map_size = prsc->width0;
+
+   *out_transfer = transfer;
+
+   return map + box->x;
+
+fail:
+   pipe_resource_reference(&transfer->resource, NULL);
+   ralloc_free(rtransfer);
+   return NULL;
+}
+
+/* pipe_context::buffer_unmap: end a transfer started by rkt_buffer_map().
+ *
+ * Issues FINI_BO if the mapping allowed writes, then releases the CPU
+ * mapping, the resource reference and the transfer itself. */
+static void
+rkt_buffer_unmap(struct pipe_context *pctx,
+                 struct pipe_transfer *transfer)
+{
+   struct rkt_screen *screen = rkt_screen(pctx->screen);
+   struct rkt_resource *rsrc = rkt_resource(transfer->resource);
+   struct rkt_transfer *rtransfer = (struct rkt_transfer *)transfer;
+   struct drm_rocket_fini_bo arg = {0};
+   int ret;
+
+   arg.handle = rsrc->handle;
+
+   /* Test the bit: usage may carry other flags besides PIPE_MAP_WRITE. */
+   if (transfer->usage & PIPE_MAP_WRITE) {
+      ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_FINI_BO, &arg);
+      assert(ret >= 0);
+      (void)ret;
+   }
+
+   /* Every map creates a new mapping; without this each map/unmap pair leaks
+    * address space. */
+   os_munmap(rtransfer->map, rtransfer->map_size);
+
+   pipe_resource_reference(&transfer->resource, NULL);
+   ralloc_free(rtransfer);
+}
+
+/* pipe_screen::context_create: allocate a context and install the buffer and
+ * ML entry points. Returns NULL if the allocation fails. `flags` is unused. */
+static struct pipe_context *
+rkt_create_context(struct pipe_screen *screen,
+                   void *priv, unsigned flags)
+{
+   struct rkt_context *ctx = rzalloc(NULL, struct rkt_context);
+
+   /* Check before touching ctx: forming &ctx->base from a NULL pointer is
+    * not valid. */
+   if (!ctx)
+      return NULL;
+
+   struct pipe_context *pctx = &ctx->base;
+
+   pctx->screen = screen;
+   pctx->priv = priv;
+
+   pctx->destroy = rkt_destroy_context;
+
+   pctx->buffer_map = rkt_buffer_map;
+   pctx->buffer_unmap = rkt_buffer_unmap;
+   pctx->resource_copy_region = util_resource_copy_region;
+   pctx->buffer_subdata = u_default_buffer_subdata;
+   pctx->clear_buffer = u_default_clear_buffer;
+
+   pctx->ml_operation_supported = rkt_ml_operation_supported;
+   pctx->ml_subgraph_create = rkt_ml_subgraph_create;
+   pctx->ml_subgraph_invoke = rkt_ml_subgraph_invoke;
+   pctx->ml_subgraph_read_output = rkt_ml_subgraph_read_outputs;
+   pctx->ml_subgraph_destroy = rkt_ml_subgraph_destroy;
+
+   return pctx;
+}
+
+/* pipe_screen::resource_create: create a buffer backed by a kernel BO.
+ *
+ * Only single-layer PIPE_BUFFER templates are supported; width0 is the size
+ * in bytes. Returns NULL if the allocation or the CREATE_BO ioctl fails.
+ * With ROCKET_DEBUG=zero_bos the new buffer is cleared. */
+static struct pipe_resource *
+rkt_resource_create(struct pipe_screen *pscreen,
+                    const struct pipe_resource *templat)
+{
+   struct rkt_screen *screen = rkt_screen(pscreen);
+   struct drm_rocket_create_bo arg = {0};
+   struct rkt_resource *rsc;
+   int ret;
+
+   assert(templat->target == PIPE_BUFFER);
+   assert(templat->height0 == 1);
+   assert(templat->depth0 == 1);
+   assert(templat->array_size == 1);
+
+   rsc = rzalloc(NULL, struct rkt_resource);
+   if (!rsc)
+      return NULL;
+
+   rsc->base = *templat;
+   rsc->base.screen = pscreen;
+   rsc->base.nr_samples = templat->nr_samples;
+   pipe_reference_init(&rsc->base.reference, 1);
+
+   rsc->bo_size = templat->width0;
+
+   arg.size = templat->width0;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_CREATE_BO, &arg);
+   if (ret < 0)
+      goto free_rsc;
+
+   rsc->handle = arg.handle;
+   rsc->phys_addr = arg.dma_address;
+   rsc->fake_offset = arg.offset;
+
+   if (DBG_ENABLED(ROCKET_DBG_ZERO)) {
+      void *map = os_mmap(NULL, arg.size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                          screen->fd, rsc->fake_offset);
+      /* Zeroing is a debugging aid: skip it when the mapping fails instead
+       * of writing through MAP_FAILED, and do not leak the temporary
+       * mapping. */
+      if (map != MAP_FAILED) {
+         memset(map, 0, arg.size);
+         os_munmap(map, arg.size);
+      } else {
+         DBG("Failed to map the new buffer for zeroing\n");
+      }
+   }
+
+   return &rsc->base;
+
+free_rsc:
+   ralloc_free(rsc);
+   return NULL;
+}
+
+/* pipe_screen::resource_destroy: close the GEM handle backing the buffer and
+ * free the resource. */
+static void
+rkt_resource_destroy(struct pipe_screen *pscreen,
+                     struct pipe_resource *prsc)
+{
+   struct rkt_resource *rsc = rkt_resource(prsc);
+   struct drm_gem_close close_arg = {0};
+   int ret;
+
+   close_arg.handle = rsc->handle;
+
+   ret = drmIoctl(rkt_screen(pscreen)->fd, DRM_IOCTL_GEM_CLOSE, &close_arg);
+   assert(ret >= 0);
+
+   ralloc_free(rsc);
+}
+
+/* pipe_screen::get_screen_fd: return the device fd the screen was created
+ * with. */
+static int
+rkt_screen_get_fd(struct pipe_screen *pscreen)
+{
+   struct rkt_screen *screen = rkt_screen(pscreen);
+
+   return screen->fd;
+}
+
+/* Create the screen for the device opened as `fd`.
+ *
+ * Reads ROCKET_DEBUG into rocket_debug and installs the screen entry points.
+ * Returns NULL if the allocation fails. `config` is unused.
+ *
+ * NOTE(review): `ro` is not stored in the screen, so the ro->destroy() call
+ * in rkt_destroy_screen() never runs -- confirm who owns `ro`. */
+struct pipe_screen *
+rkt_screen_create(int fd,
+                  const struct pipe_screen_config *config,
+                  struct renderonly *ro)
+{
+   struct rkt_screen *screen = rzalloc(NULL, struct rkt_screen);
+
+   if (screen == NULL)
+      return NULL;
+
+   rocket_debug = debug_get_option_rocket_debug();
+
+   screen->fd = fd;
+
+   struct pipe_screen *pscreen = &screen->pscreen;
+   pscreen->get_screen_fd = rkt_screen_get_fd;
+   pscreen->destroy = rkt_destroy_screen;
+   pscreen->context_create = rkt_create_context;
+   pscreen->resource_create = rkt_resource_create;
+   pscreen->resource_destroy = rkt_resource_destroy;
+
+   return pscreen;
+}
--- a/src/gallium/drivers/rocket/rkt_device.h
+++ b/src/gallium/drivers/rocket/rkt_device.h
@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "renderonly/renderonly.h"
+#include "util/log.h"
+
+#ifndef RKT_SCREEN_H
+#define RKT_SCREEN_H
+
+/* Bits of rocket_debug, selected through the ROCKET_DEBUG environment
+ * variable. */
+enum rkt_dbg {
+   ROCKET_DBG_MSGS = BITFIELD_BIT(0),     /* "dbg_msgs": enables DBG() output */
+   ROCKET_DBG_DUMP_BOS = BITFIELD_BIT(1), /* "dump_bos": dump buffers to files */
+   ROCKET_DBG_ZERO = BITFIELD_BIT(2),     /* "zero_bos": clear new buffers */
+};
+
+extern int rocket_debug;
+
+/* True when any of the given rkt_dbg bits is set in rocket_debug. */
+#define DBG_ENABLED(flag) unlikely(rocket_debug &(flag))
+
+/* printf-style debug message, prefixed with the calling function and line;
+ * emitted only when ROCKET_DBG_MSGS is set. */
+#define DBG(fmt, ...)                                 \
+   do {                                               \
+      if (DBG_ENABLED(ROCKET_DBG_MSGS))               \
+         mesa_logd("%s:%d: " fmt, __func__, __LINE__, \
+                   ##__VA_ARGS__);                    \
+   } while (0)
+
+/* Driver screen. pscreen must stay the first member: rkt_screen() casts a
+ * pipe_screen pointer straight to this type. */
+struct rkt_screen {
+   struct pipe_screen pscreen;
+
+   int fd;                /* device fd passed to rkt_screen_create() */
+   struct renderonly *ro; /* destroyed together with the screen when set */
+};
+
+/* Downcast a pipe_screen created by this driver to its rkt_screen. */
+static inline struct rkt_screen *
+rkt_screen(struct pipe_screen *p)
+{
+   struct rkt_screen *screen = (struct rkt_screen *)p;
+
+   return screen;
+}
+
+/* Driver context. base must stay the first member: rkt_context() casts a
+ * pipe_context pointer straight to this type. */
+struct rkt_context {
+   struct pipe_context base;
+};
+
+/* Downcast a pipe_context created by this driver to its rkt_context. */
+static inline struct rkt_context *
+rkt_context(struct pipe_context *pctx)
+{
+   struct rkt_context *ctx = (struct rkt_context *)pctx;
+
+   return ctx;
+}
+
+/* Buffer resource backed by a kernel BO. base must stay the first member:
+ * rkt_resource() casts a pipe_resource pointer straight to this type. */
+struct rkt_resource {
+   struct pipe_resource base;
+
+   uint32_t handle;      /* GEM handle returned by CREATE_BO */
+   uint64_t phys_addr;   /* dma_address returned by CREATE_BO */
+   uint64_t obj_addr;    /* not assigned in rkt_device.c */
+   uint64_t fake_offset; /* offset returned by CREATE_BO, passed to mmap() */
+   uint64_t bo_size;     /* requested size in bytes (template width0) */
+};
+
+/* Downcast a pipe_resource created by this driver to its rkt_resource. */
+static inline struct rkt_resource *
+rkt_resource(struct pipe_resource *p)
+{
+   struct rkt_resource *rsc = (struct rkt_resource *)p;
+
+   return rsc;
+}
+
+/* Creates the screen for the device opened as `fd`; returns NULL on
+ * allocation failure. */
+struct pipe_screen *rkt_screen_create(int fd,
+                                      const struct pipe_screen_config *config,
+                                      struct renderonly *ro);
+
+#endif /* RKT_SCREEN_H */
--- a/src/gallium/drivers/rocket/rkt_ml.c
+++ b/src/gallium/drivers/rocket/rkt_ml.c
@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "pipe/p_state.h"
+#include "util/macros.h"
+#include "util/u_dynarray.h"
+#include "util/u_inlines.h"
+
+#include <xf86drm.h>
+
+#include "drm-uapi/rocket_accel.h"
+
+#include "rkt_coefs.h"
+#include "rkt_ml.h"
+#include "rkt_regcmd.h"
+#include "rkt_task.h"
+
+/*
+ * Debug helper: writes `size` bytes starting at `ptr + offset` to a file
+ * named "mesa-<name>-<operation_nr>-<suboperation_nr>.bin" in the current
+ * working directory.
+ *
+ * Failures to open, write or close the file are reported through DBG() and
+ * otherwise ignored, so that a failed dump never takes the process down.
+ */
+void
+rkt_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
+                int suboperation_nr, int offset, unsigned size)
+{
+   char buffer[255];
+
+   /* The casts make the arguments match the %u conversions. */
+   snprintf(buffer, sizeof(buffer), "mesa-%s-%03u-%03u.bin", name,
+            (unsigned)operation_nr, (unsigned)suboperation_nr);
+
+   FILE *f = fopen(buffer, "wb");
+   if (f == NULL) {
+      DBG("Error in opening file %s: %s\n", buffer, strerror(errno));
+      return;
+   }
+
+   if (fwrite(ptr + offset, 1, size, f) != size) {
+      DBG("Error in writing to file: %s\n", strerror(errno));
+   }
+
+   /* fclose() flushes; a failure here means the data did not reach the file. */
+   if (fclose(f) != 0) {
+      DBG("Error in closing file %s: %s\n", buffer, strerror(errno));
+   }
+}
+
+/*
+ * Makes sure the tensor slot `idx` of the subgraph is backed by a buffer of
+ * `size` bytes. A slot that already has a buffer is left alone (its size is
+ * only checked with an assertion).
+ */
+static void
+create_tensor(struct rkt_ml_subgraph *subgraph, unsigned idx,
+              unsigned size)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct pipe_resource **slots = util_dynarray_begin(&subgraph->tensors);
+
+   assert(idx < util_dynarray_num_elements(&subgraph->tensors,
+                                           struct pipe_resource *));
+
+   struct pipe_resource **slot = &slots[idx];
+
+   if (*slot == NULL) {
+      *slot = pipe_buffer_create(context->screen, 0, PIPE_USAGE_DEFAULT, size);
+      return;
+   }
+
+   assert(size == pipe_buffer_size(*slot));
+}
+
+/*
+ * Returns the driver resource stored in tensor slot `idx`, or NULL when the
+ * slot has not been populated. `idx` is not range-checked here.
+ */
+struct rkt_resource *
+rkt_get_tensor(struct rkt_ml_subgraph *subgraph,
+               unsigned idx)
+{
+   struct pipe_resource **slot =
+      util_dynarray_element(&subgraph->tensors, struct pipe_resource *, idx);
+
+   return rkt_resource(*slot);
+}
+
+/*
+ * A convolution is only treated as depthwise when it is flagged as such and
+ * both its first input and first output have more than one channel (dims[3]).
+ */
+bool
+rkt_is_depthwise(const struct pipe_ml_operation *poperation)
+{
+   unsigned in_channels = poperation->input_tensors[0]->dims[3];
+   unsigned out_channels = poperation->output_tensors[0]->dims[3];
+
+   if (!poperation->conv.depthwise)
+      return false;
+
+   if (in_channels <= 1)
+      return false;
+
+   return out_channels > 1;
+}
+
+/*
+ * Size in bytes of the buffer allocated for an operation's output: the
+ * channel count is rounded up to whole groups of FEATURE_ATOMIC_SIZE and
+ * that number of groups is doubled.
+ */
+static unsigned
+calc_raw_output_size(struct rkt_operation *operation)
+{
+   unsigned channel_groups =
+      DIV_ROUND_UP(operation->output_channels, FEATURE_ATOMIC_SIZE) * 2;
+   unsigned size = operation->output_width * operation->output_height;
+
+   size *= channel_groups;
+   size *= FEATURE_ATOMIC_SIZE;
+
+   return size;
+}
+
+/*
+ * Generates the register command stream of every task of `operation` and
+ * packs all of them into a single buffer, operation->regcmd.
+ *
+ * Each task's stream is an array of 64-bit entries produced by
+ * rkt_fill_regcmd(). Inside the buffer every stream starts at an offset that
+ * is a multiple of 64 bytes. Every stream but the last one is patched so that
+ * it carries the address of the following stream and how much of it to fetch.
+ * The location and length of each stream are recorded in the task
+ * (regcfg_addr / regcfg_amount) for use at submission time.
+ *
+ * NOTE(review): the results of calloc(), pipe_buffer_create() and
+ * pipe_buffer_map() are used unchecked.
+ */
+static void
+compile_operation(struct rkt_ml_subgraph *subgraph,
+                  struct rkt_operation *operation)
+{
+   struct pipe_context *pcontext = subgraph->base.context;
+   unsigned regcfg_total_size = 0;
+   struct util_dynarray *regcfgs;
+   struct pipe_transfer *transfer = NULL;
+   unsigned num_tasks =
+      util_dynarray_num_elements(&operation->tasks, struct split_task);
+
+   /* One dynamic array of 64-bit register commands per task. */
+   regcfgs = calloc(num_tasks, sizeof(struct util_dynarray));
+
+   /* First pass: generate the commands and add up the padded sizes. */
+   for (int i = 0; i < num_tasks; i++) {
+      util_dynarray_init(&regcfgs[i], NULL);
+      rkt_fill_regcmd(subgraph, operation, &regcfgs[i], i);
+
+      unsigned size =
+         util_dynarray_num_elements(&regcfgs[i], uint64_t) * sizeof(uint64_t);
+      regcfg_total_size += ALIGN(size, 64);
+   }
+
+   operation->regcmd = pipe_buffer_create(pcontext->screen, 0,
+                                          PIPE_USAGE_DEFAULT, regcfg_total_size);
+   uint8_t *regcmd =
+      pipe_buffer_map(pcontext, operation->regcmd, PIPE_MAP_WRITE, &transfer);
+
+   /* Second pass: chain the streams together and copy them into the buffer. */
+   unsigned regcmd_offset = 0;
+   for (int i = 0; i < num_tasks; i++) {
+      /* Number of 64-bit entries, not bytes. */
+      unsigned size = util_dynarray_num_elements(&regcfgs[i], uint64_t);
+      struct split_task *task =
+         util_dynarray_element(&operation->tasks, struct split_task, i);
+
+      if (i < num_tasks - 1) {
+         /* Patch next address and amount of regs to fetch, positions are relative
+          * to end */
+         unsigned reg_count = util_dynarray_num_elements(&regcfgs[i], uint64_t);
+         uint64_t *next_address_reg =
+            util_dynarray_element(&regcfgs[i], uint64_t, reg_count - 4);
+         uint64_t *reg_count_reg =
+            util_dynarray_element(&regcfgs[i], uint64_t, reg_count - 3);
+
+         /* Address at which the next task's stream will be placed. The value
+          * field of a packed command starts at bit 16. */
+         uint64_t addr = rkt_resource(operation->regcmd)->phys_addr +
+                         regcmd_offset + ALIGN(size * sizeof(uint64_t), 64);
+         *next_address_reg |= addr << 16;
+
+         /* NOTE(review): the fetch count is derived from the next stream's
+          * length minus 4 entries, halved and rounded up to an even number;
+          * presumably the unit and the trailer size are dictated by the
+          * hardware -- confirm against the register documentation. */
+         unsigned regs_to_fetch =
+            util_dynarray_num_elements(&regcfgs[i + 1], uint64_t);
+         regs_to_fetch -= 4;
+         regs_to_fetch = ALIGN(regs_to_fetch / 2, 2);
+         *reg_count_reg |= regs_to_fetch << 16;
+      }
+
+      memcpy(regcmd + regcmd_offset, util_dynarray_begin(&regcfgs[i]),
+             size * sizeof(uint64_t));
+      util_dynarray_fini(&regcfgs[i]);
+
+      task->regcfg_amount = size;
+      task->regcfg_addr =
+         rkt_resource(operation->regcmd)->phys_addr + regcmd_offset;
+
+      /* NOTE(review): this dumps 4 entries more than were copied; for the
+       * last task that may extend past the end of the buffer -- confirm. */
+      if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+         rkt_dump_buffer(regcmd, "regcmd", 0, i, regcmd_offset,
+                         (size + 4) * sizeof(uint64_t));
+
+      regcmd_offset += ALIGN(size * sizeof(uint64_t), 64);
+   }
+
+   pipe_buffer_unmap(pcontext, transfer);
+
+   /* NOTE(review): every array was already finalised in the loop above, so
+    * this second pass appears to be redundant. */
+   for (int i = 0; i < num_tasks; i++) {
+      util_dynarray_fini(&regcfgs[i]);
+   }
+
+   free(regcfgs);
+}
+
+/*
+ * Fills `operation` from the convolution described by `poperation`: copies
+ * the geometry and quantization parameters of the input, output and weight
+ * tensors, and creates the weight and bias buffers.
+ */
+static void
+lower_convolution(struct rkt_ml_subgraph *subgraph,
+                  const struct pipe_ml_operation *poperation,
+                  struct rkt_operation *operation)
+{
+   const struct pipe_tensor *in = poperation->input_tensors[0];
+   const struct pipe_tensor *out = poperation->output_tensors[0];
+   const struct pipe_tensor *kernel = poperation->conv.weight_tensor;
+
+   util_dynarray_init(&operation->tasks, NULL);
+
+   /* Convolution parameters */
+   operation->depthwise = rkt_is_depthwise(poperation);
+   operation->padding_same = poperation->conv.padding_same;
+   operation->stride = poperation->conv.stride_x;
+
+   /* Input feature map */
+   operation->input_index = in->index;
+   operation->input_width = in->dims[1];
+   operation->input_height = in->dims[2];
+   operation->input_channels = in->dims[3];
+   operation->input_zero_point = in->zero_point;
+   operation->input_scale = in->scale;
+
+   /* Output feature map */
+   operation->output_index = out->index;
+   operation->output_width = out->dims[1];
+   operation->output_height = out->dims[2];
+   operation->output_channels = out->dims[3];
+   operation->output_zero_point = out->zero_point;
+   operation->output_scale = out->scale;
+
+   /* Weights */
+   operation->weights_width = kernel->dims[1];
+   operation->weights_height = kernel->dims[2];
+   operation->weights_zero_point = kernel->zero_point;
+   operation->weights_scale = kernel->scale;
+
+   /* Buffers holding the coefficients */
+   operation->weights = rkt_fill_weights(subgraph, poperation);
+   operation->biases =
+      rkt_fill_biases(subgraph, poperation, &operation->truncate_bits);
+}
+
+/*
+ * Returns the first operation, in subgraph order, whose input is the tensor
+ * `tensor_index`, or NULL when no operation reads it.
+ */
+static struct rkt_operation *
+find_first_consumer(struct rkt_ml_subgraph *subgraph, unsigned tensor_index)
+{
+   struct rkt_operation *ops = util_dynarray_begin(&subgraph->operations);
+   unsigned num_ops =
+      util_dynarray_num_elements(&subgraph->operations, struct rkt_operation);
+
+   for (unsigned i = 0; i < num_ops; i++) {
+      if (ops[i].input_index == tensor_index)
+         return &ops[i];
+   }
+
+   return NULL;
+}
+
+/*
+ * Returns the first operation, in subgraph order, whose output is the tensor
+ * `tensor_index`, or NULL when no operation writes it.
+ */
+static struct rkt_operation *
+find_producer(struct rkt_ml_subgraph *subgraph,
+              unsigned tensor_index)
+{
+   struct rkt_operation *ops = util_dynarray_begin(&subgraph->operations);
+   unsigned num_ops =
+      util_dynarray_num_elements(&subgraph->operations, struct rkt_operation);
+
+   for (unsigned i = 0; i < num_ops; i++) {
+      if (ops[i].output_index == tensor_index)
+         return &ops[i];
+   }
+
+   return NULL;
+}
+
+/*
+ * Returns the number of tensor slots needed for the given operations, i.e.
+ * the highest tensor index that any of them references plus one.
+ */
+static unsigned
+count_tensors(const struct pipe_ml_operation *poperations,
+              unsigned count)
+{
+   unsigned highest_index = 0;
+
+   for (unsigned i = 0; i < count; i++) {
+      const struct pipe_ml_operation *op = &poperations[i];
+
+      highest_index = MAX2(highest_index, op->input_tensors[0]->index);
+      highest_index = MAX2(highest_index, op->output_tensors[0]->index);
+
+      if (op->type == PIPE_ML_OPERATION_TYPE_CONVOLUTION) {
+         highest_index = MAX2(highest_index, op->conv.weight_tensor->index);
+         highest_index = MAX2(highest_index, op->conv.bias_tensor->index);
+      } else if (op->type == PIPE_ML_OPERATION_TYPE_ADD) {
+         highest_index = MAX2(highest_index, op->input_tensors[1]->index);
+      } else {
+         DBG("poperation->type %d\n", op->type);
+         unreachable("Unsupported ML operation type");
+      }
+   }
+
+   return highest_index + 1;
+}
+
+/*
+ * Only per-tensor quantization is supported. A tensor that carries per-axis
+ * scales or zero points is rejected, for details see:
+ * https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor
+ */
+static bool
+tensor_quantization_supported(struct pipe_tensor *tensor)
+{
+   if (tensor->scales != NULL)
+      return false;
+
+   return tensor->zero_points == NULL;
+}
+
+/*
+ * Tells the frontend whether this driver can run `operation`.
+ *
+ * Convolutions are accepted when no tensor uses per-axis quantization and
+ * there is no dilation. Additions are accepted when neither input has a
+ * backing resource. Everything else is rejected.
+ */
+bool
+rkt_ml_operation_supported(struct pipe_context *pcontext,
+                           const struct pipe_ml_operation *operation)
+{
+   if (operation->type == PIPE_ML_OPERATION_TYPE_ADD) {
+      return operation->input_tensors[0]->resource == NULL &&
+             operation->input_tensors[1]->resource == NULL;
+   }
+
+   if (operation->type != PIPE_ML_OPERATION_TYPE_CONVOLUTION)
+      return false;
+
+   /* Per-axis quantization not yet implemented */
+   if (!tensor_quantization_supported(operation->input_tensors[0]))
+      return false;
+   if (!tensor_quantization_supported(operation->conv.weight_tensor))
+      return false;
+   if (!tensor_quantization_supported(operation->conv.bias_tensor))
+      return false;
+   if (!tensor_quantization_supported(operation->output_tensors[0]))
+      return false;
+
+   /* Dilation not yet implemented */
+   if (operation->conv.dilation_width_factor != 1)
+      return false;
+
+   return operation->conv.dilation_height_factor == 1;
+}
+
+/*
+ * Builds a driver subgraph out of `count` frontend operations.
+ *
+ * The work happens in four steps:
+ *  1. Lower: every convolution becomes an rkt_operation; every addition is
+ *     fused into the convolution that produces its first input.
+ *  2. Allocate a buffer for the input tensor of every operation.
+ *  3. Allocate a buffer for every output tensor that does not have one yet.
+ *  4. Split every operation into tasks and generate its register commands.
+ *
+ * Returns the new subgraph, or NULL when memory for the subgraph or for its
+ * tensor table cannot be allocated.
+ */
+struct pipe_ml_subgraph *
+rkt_ml_subgraph_create(struct pipe_context *pcontext,
+                       const struct pipe_ml_operation *poperations,
+                       unsigned count)
+{
+   struct rkt_ml_subgraph *subgraph;
+   unsigned tensor_count;
+
+   subgraph = calloc(1, sizeof(*subgraph));
+   if (subgraph == NULL)
+      return NULL;
+   subgraph->base.context = pcontext;
+
+   tensor_count = count_tensors(poperations, count);
+   util_dynarray_init(&subgraph->tensors, NULL);
+   util_dynarray_init(&subgraph->operations, NULL);
+   if (!util_dynarray_resize(&subgraph->tensors, struct pipe_resource *,
+                             tensor_count)) {
+      /* Nothing else has been allocated yet, only the subgraph itself. */
+      free(subgraph);
+      return NULL;
+   }
+   /* All slots start empty; buffers are created on demand below. */
+   memset(util_dynarray_begin(&subgraph->tensors), 0, subgraph->tensors.size);
+
+   /* Lower */
+   for (unsigned i = 0; i < count; i++) {
+      struct rkt_operation operation = {0};
+      operation.add_tensor = -1; /* no addition fused (yet) */
+
+      switch (poperations[i].type) {
+      case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
+         lower_convolution(subgraph, &poperations[i], &operation);
+         util_dynarray_append(&subgraph->operations, struct rkt_operation,
+                              operation);
+         break;
+      case PIPE_ML_OPERATION_TYPE_ADD: {
+         /* Fuse tensor addition into convolution */
+         struct rkt_operation *input_op_1 =
+            find_producer(subgraph, poperations[i].input_tensors[1]->index);
+         struct rkt_operation *input_op_2 =
+            find_producer(subgraph, poperations[i].input_tensors[0]->index);
+
+         /* NOTE(review): the assertion on input_op_1 makes the "graph input"
+          * branch below unreachable in debug builds. It is kept because no
+          * buffer is created here for a tensor that is only used as the
+          * second input of an addition. */
+         assert(input_op_1);
+         assert(input_op_2);
+
+         if (input_op_1 == NULL) {
+            /* Graph input */
+            input_op_2->add_tensor = poperations[i].input_tensors[1]->index;
+         } else {
+            input_op_1->addition_input = true;
+            input_op_2->add_tensor = input_op_1->output_index;
+         }
+
+         /* The fused convolution now writes the output of the addition. */
+         input_op_2->output_index = poperations[i].output_tensors[0]->index;
+         input_op_2->addition_offset =
+            0x80 - poperations[i].input_tensors[1]->zero_point;
+         input_op_2->addition_scale = poperations[i].input_tensors[1]->scale;
+
+         break;
+      }
+      default:
+         DBG("poperation->type %d\n", poperations[i].type);
+         unreachable("Unsupported ML operation type");
+      }
+   }
+
+   /* Create input tensors */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      unsigned input_channels_1 =
+         DIV_ROUND_UP(operation->input_channels, FEATURE_ATOMIC_SIZE) * 2;
+      unsigned input_channels_2 = FEATURE_ATOMIC_SIZE;
+      unsigned input_size = operation->input_width * operation->input_height *
+                            input_channels_1 * input_channels_2;
+
+      create_tensor(subgraph, operation->input_index, input_size);
+   }
+
+   /* Create output tensors */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      struct rkt_resource *res =
+         rkt_get_tensor(subgraph, operation->output_index);
+      /* Already created as the input of another operation. */
+      if (res != NULL)
+         continue;
+
+      create_tensor(subgraph, operation->output_index,
+                    calc_raw_output_size(operation));
+   }
+
+   /* Compile */
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      rkt_split_tasks(subgraph, operation);
+      compile_operation(subgraph, operation);
+   }
+
+   return &subgraph->base;
+}
+
+/*
+ * Uploads the inputs of the subgraph and submits all of its operations to
+ * the kernel in a single DRM_IOCTL_ROCKET_SUBMIT call.
+ *
+ * inputs_count: number of entries in input_idxs[] and inputs[].
+ * input_idxs:   tensor index of every input.
+ * inputs:       user data of every input, indexed as [x][y][channel] with the
+ *               dimensions of the first operation that consumes the tensor.
+ * is_signed:    not used by this implementation.
+ *
+ * The function does not wait for the submitted jobs here. A failed
+ * submission is logged (and asserts in debug builds).
+ */
+void
+rkt_ml_subgraph_invoke(struct pipe_context *pcontext,
+                       struct pipe_ml_subgraph *psubgraph,
+                       unsigned inputs_count, unsigned input_idxs[],
+                       void *inputs[], bool is_signed[])
+{
+   struct rkt_screen *screen = rkt_screen(pcontext->screen);
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+   int ret;
+
+   DBG("Processing input\n");
+
+   for (unsigned i = 0; i < inputs_count; i++) {
+      struct rkt_operation *operation =
+         find_first_consumer(subgraph, input_idxs[i]);
+      assert(operation);
+      struct pipe_resource *input =
+         &rkt_get_tensor(subgraph, input_idxs[i])->base;
+      unsigned input_channels = operation->input_channels;
+      unsigned output_channels = operation->output_channels;
+
+      struct rkt_resource *input_tensor =
+         rkt_get_tensor(subgraph, operation->input_index);
+      if (output_channels == 1 && input_channels == 1 &&
+          !operation->addition_input && (operation->add_tensor == -1)) {
+         /* NOTE(review): `input` and `input_tensor` are looked up with the
+          * same tensor index, so this copies the buffer onto itself and
+          * inputs[i] is never read on this path -- confirm the intent. */
+         pipe_buffer_copy(pcontext, &input_tensor->base, input, 0, 0,
+                          pipe_buffer_size(input));
+      } else {
+         unsigned input_width = operation->input_width;
+         unsigned input_height = operation->input_height;
+         unsigned zero_point = operation->input_zero_point;
+         struct pipe_transfer *transfer_out;
+         uint8_t(*input_in)[input_height][input_channels] = inputs[i];
+         uint8_t *map = pipe_buffer_map(pcontext, &input_tensor->base,
+                                        PIPE_MAP_WRITE, &transfer_out);
+
+         DBG("Converting data\n");
+
+         /*
+          * From the NVDLA docs: "For int8, one element of data refers to an 8-bit
+          * signed integer." But only when transposing do we seem to need to
+          * convert to signed. The DMA unit seems to be able to convert from
+          * unsigned to signed though.
+          */
+         if (input_channels == 1) {
+            /* Single channel: values are copied as they are and every column
+             * is padded with the zero point up to FEATURE_ATOMIC_SIZE rows. */
+            unsigned n = 0;
+            for (unsigned x = 0; x < input_width; x++) {
+               for (unsigned y = 0; y < MAX2(input_height, FEATURE_ATOMIC_SIZE);
+                    y++) {
+                  if (y < input_height)
+                     map[n++] = input_in[x][y][0];
+                  else
+                     map[n++] = zero_point;
+               }
+            }
+         } else {
+            /* Several channels: channels are written in groups of
+             * FEATURE_ATOMIC_SIZE, converted to signed by subtracting 0x80.
+             * Missing channels of the last group get the zero point. */
+            unsigned n = 0;
+            for (unsigned u = 0;
+                 u < DIV_ROUND_UP(input_channels, FEATURE_ATOMIC_SIZE); u++) {
+               for (unsigned x = 0; x < input_width; x++) {
+                  for (unsigned y = 0; y < input_height; y++) {
+                     for (unsigned c = 0; c < FEATURE_ATOMIC_SIZE; c++) {
+                        unsigned input_channel = c + u * FEATURE_ATOMIC_SIZE;
+                        if (input_channel < input_channels)
+                           map[n++] = input_in[x][y][input_channel] - 0x80;
+                        else
+                           map[n++] = zero_point - 0x80;
+                     }
+                  }
+               }
+            }
+         }
+
+         if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+            rkt_dump_buffer(map, "input", 0, 0, 0,
+                            rkt_get_tensor(subgraph, input_idxs[i])->bo_size);
+
+         DBG("Converted data\n");
+
+         pipe_buffer_unmap(pcontext, transfer_out);
+      }
+   }
+   DBG("Processed input\n");
+
+   DBG("Submitting graph\n");
+
+   struct util_dynarray jobs = {0};
+   util_dynarray_init(&jobs, NULL);
+
+   /*
+    * The BO handle arrays of an operation can be referenced by several jobs
+    * (or by none, if the operation has no tasks), so they are tracked here
+    * and released exactly once after the submission.
+    */
+   struct util_dynarray bo_handle_arrays = {0};
+   util_dynarray_init(&bo_handle_arrays, NULL);
+
+   util_dynarray_foreach (&subgraph->operations, struct rkt_operation,
+                          operation) {
+      unsigned num_inputs = operation->add_tensor != -1 ? 2 : 1;
+      uint32_t *in_bo_handles = calloc(num_inputs, sizeof(uint32_t));
+      uint32_t *out_bo_handles = malloc(sizeof(uint32_t));
+
+      util_dynarray_append(&bo_handle_arrays, uint32_t *, in_bo_handles);
+      util_dynarray_append(&bo_handle_arrays, uint32_t *, out_bo_handles);
+
+      in_bo_handles[0] = rkt_get_tensor(subgraph, operation->input_index)->handle;
+
+      if (operation->add_tensor != -1)
+         in_bo_handles[1] =
+            rkt_get_tensor(subgraph, operation->add_tensor)->handle;
+
+      out_bo_handles[0] =
+         rkt_get_tensor(subgraph, operation->output_index)->handle;
+
+      if (operation->reuse_weights_cbuf) {
+         /* Submit all tasks to the same core, so weights can be reused */
+         unsigned num_tasks =
+            util_dynarray_num_elements(&operation->tasks, struct split_task);
+         struct drm_rocket_task *tasks = calloc(num_tasks, sizeof(*tasks));
+         unsigned task_count = 0;
+         util_dynarray_foreach (&operation->tasks, struct split_task, task) {
+            tasks[task_count].regcmd = task->regcfg_addr;
+            tasks[task_count].regcmd_count = task->regcfg_amount;
+            task_count++;
+         }
+         struct drm_rocket_job job = {0};
+         job.task_struct_size = sizeof(struct drm_rocket_task);
+         job.in_bo_handles = (uint64_t)(uintptr_t)in_bo_handles;
+         job.in_bo_handle_count = num_inputs;
+         job.out_bo_handles = (uint64_t)(uintptr_t)out_bo_handles;
+         job.out_bo_handle_count = 1;
+         job.tasks = (uint64_t)(uintptr_t)tasks;
+         job.task_count = task_count;
+         util_dynarray_append(&jobs, struct drm_rocket_job, job);
+      } else {
+         /* Spread tasks among cores, for parallelism */
+         util_dynarray_foreach (&operation->tasks, struct split_task, task) {
+            struct drm_rocket_task *ktask = calloc(1, sizeof(*ktask));
+            ktask->regcmd = task->regcfg_addr;
+            ktask->regcmd_count = task->regcfg_amount;
+
+            struct drm_rocket_job job = {0};
+            job.task_struct_size = sizeof(struct drm_rocket_task);
+            job.in_bo_handles = (uint64_t)(uintptr_t)in_bo_handles;
+            job.in_bo_handle_count = num_inputs;
+            job.out_bo_handles = (uint64_t)(uintptr_t)out_bo_handles;
+            job.out_bo_handle_count = 1;
+            job.tasks = (uint64_t)(uintptr_t)ktask;
+            job.task_count = 1;
+            util_dynarray_append(&jobs, struct drm_rocket_job, job);
+         }
+      }
+   }
+
+   struct drm_rocket_submit submit = {0};
+   submit.job_struct_size = sizeof(struct drm_rocket_job);
+   submit.jobs = (uint64_t)(uintptr_t)util_dynarray_begin(&jobs);
+   submit.job_count = util_dynarray_num_elements(&jobs, struct drm_rocket_job);
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ROCKET_SUBMIT, &submit);
+   /* Make the failure visible in builds where assert() is compiled out. */
+   if (ret != 0)
+      mesa_loge("rocket: job submission failed: %s", strerror(errno));
+   assert(ret == 0);
+
+   /* Every job owns its task array; the handle arrays are owned per
+    * operation and released through bo_handle_arrays. */
+   util_dynarray_foreach (&jobs, struct drm_rocket_job, job) {
+      free((void *)(uintptr_t)job->tasks);
+   }
+   util_dynarray_fini(&jobs);
+
+   util_dynarray_foreach (&bo_handle_arrays, uint32_t *, handles) {
+      free(*handles);
+   }
+   util_dynarray_fini(&bo_handle_arrays);
+
+   DBG("Submitted graph\n");
+}
+
+/*
+ * Copies the requested output tensors into the caller's buffers.
+ *
+ * In the device buffer the channels are stored in groups of
+ * FEATURE_ATOMIC_SIZE, indexed as [group][y][x][channel in group]. The
+ * caller's buffer is indexed as [y][x][channel]. 0x80 is added to every
+ * value on the way out. `is_signed` is not used by this implementation.
+ */
+void
+rkt_ml_subgraph_read_outputs(struct pipe_context *pcontext,
+                             struct pipe_ml_subgraph *psubgraph,
+                             unsigned outputs_count,
+                             unsigned output_idxs[], void *outputs[],
+                             bool is_signed[])
+{
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+
+   DBG("Processing output\n");
+
+   for (int i = 0; i < outputs_count; i++) {
+      struct rkt_operation *producer = find_producer(subgraph, output_idxs[i]);
+      struct rkt_resource *output_tensor =
+         rkt_get_tensor(subgraph, output_idxs[i]);
+      struct pipe_transfer *transfer = NULL;
+
+      DBG("Before pipe_buffer_map\n");
+      uint8_t *raw_output = pipe_buffer_map(pcontext, &output_tensor->base,
+                                            PIPE_MAP_READ, &transfer);
+      DBG("After pipe_buffer_map\n");
+
+      DBG("Converting data\n");
+
+      /* Device layout: [group][y][x][channel in group] */
+      uint8_t(*tiled)[producer->output_height][producer->output_width]
+                     [FEATURE_ATOMIC_SIZE] = (void *)raw_output;
+      /* User layout: [y][x][channel] */
+      uint8_t(*linear)[producer->output_width][producer->output_channels] =
+         (void *)outputs[i];
+
+      if (DBG_ENABLED(ROCKET_DBG_DUMP_BOS))
+         rkt_dump_buffer(raw_output, "output", 0, 0, 0, output_tensor->bo_size);
+
+      for (int oc = 0; oc < producer->output_channels; oc++) {
+         /* Both only depend on the channel, so compute them once. */
+         const unsigned group = oc / FEATURE_ATOMIC_SIZE;
+         const unsigned lane = oc % FEATURE_ATOMIC_SIZE;
+
+         for (int x = 0; x < producer->output_width; x++) {
+            for (int y = 0; y < producer->output_height; y++)
+               linear[y][x][oc] = tiled[group][y][x][lane] + 0x80;
+         }
+      }
+
+      DBG("Converted data\n");
+
+      pipe_buffer_unmap(pcontext, transfer);
+   }
+
+   DBG("Processed output\n");
+}
+
+/*
+ * Drops the references that an operation holds on its buffers and frees its
+ * task list. The operation itself is not freed.
+ */
+static void
+free_operation(struct rkt_operation *operation)
+{
+   /* Buffers created while lowering and compiling the operation */
+   pipe_resource_reference(&operation->regcmd, NULL);
+   pipe_resource_reference(&operation->weights, NULL);
+   pipe_resource_reference(&operation->biases, NULL);
+
+   /* Array of struct split_task */
+   util_dynarray_fini(&operation->tasks);
+}
+
+/*
+ * Releases everything owned by the subgraph: the resources of every
+ * operation, the tensor buffers, both arrays and the subgraph itself.
+ */
+void
+rkt_ml_subgraph_destroy(struct pipe_context *context,
+                        struct pipe_ml_subgraph *psubgraph)
+{
+   struct rkt_ml_subgraph *subgraph = (struct rkt_ml_subgraph *)(psubgraph);
+
+   struct rkt_operation *ops = util_dynarray_begin(&subgraph->operations);
+   unsigned num_ops =
+      util_dynarray_num_elements(&subgraph->operations, struct rkt_operation);
+   for (unsigned i = 0; i < num_ops; i++) {
+      free_operation(&ops[i]);
+   }
+   util_dynarray_fini(&subgraph->operations);
+
+   /* Dropping the reference of a slot that is still NULL is a no-op. */
+   struct pipe_resource **tensors = util_dynarray_begin(&subgraph->tensors);
+   unsigned num_tensors =
+      util_dynarray_num_elements(&subgraph->tensors, struct pipe_resource *);
+   for (unsigned i = 0; i < num_tensors; i++) {
+      pipe_resource_reference(&tensors[i], NULL);
+   }
+   util_dynarray_fini(&subgraph->tensors);
+
+   free(subgraph);
+}
--- a/src/gallium/drivers/rocket/rkt_ml.h
+++ b/src/gallium/drivers/rocket/rkt_ml.h
@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_ML_H
+#define RKT_ML_H
+
+#include <util/u_dynarray.h>
+
+#include "rkt_device.h"
+
+// Hardware constants. For the convolution buffer (CBUF) see:
+// http://nvdla.org/hw/v1/ias/unit_description.html#convolution-buffer
+#define CBUF_BANK_SIZE        32768
+#define CBUF_BANKS            12
+#define CBUF_ENTRIES_PER_BANK 256
+// Evaluates to 128.
+#define CBUF_ENTRY_SIZE       (CBUF_BANK_SIZE / CBUF_ENTRIES_PER_BANK)
+// Number of channels that are stored together in the feature data layout.
+#define FEATURE_ATOMIC_SIZE   16
+// NOTE(review): the two constants below are not used in this part of the
+// driver; see the weight packing code for their meaning.
+#define WEIGHT_ATOMIC_SIZE    32
+#define ATOMIC_K_SIZE         16
+
+/*
+ * One of the tasks that an operation is split into (see rkt_split_tasks()).
+ * Apart from regcfg_amount and regcfg_addr, the fields are filled in outside
+ * of this header's view; the grouping comments below follow the field names.
+ */
+struct split_task {
+   unsigned num;
+
+   /* Slices covered by this task */
+   unsigned top_slice;
+   unsigned bottom_slice;
+   unsigned num_overlap_slices;
+   unsigned num_retain_slices;
+   unsigned convolutions;
+
+   /* Padding */
+   unsigned pad_top;
+   unsigned pad_bottom;
+   unsigned pad_left;
+   unsigned pad_right;
+
+   /* Strides */
+   unsigned stride_x;
+   unsigned stride_y;
+
+   /* Input feature data */
+   unsigned input_width;
+   unsigned input_height;
+   unsigned input_channels;
+   unsigned input_channels_real;
+   unsigned input_zero_point;
+   float input_scale;
+   unsigned input_data_entries;
+   int input_line_stride;
+   int input_surface_stride;
+   unsigned input_offset;
+
+   /* Output feature data */
+   unsigned output_width;
+   unsigned output_height;
+   unsigned output_channels;
+   unsigned output_channels_real;
+   unsigned output_zero_point;
+   float output_scale;
+   int output_surface_stride;
+   unsigned output_offset;
+
+   /* Weights */
+   unsigned weights_width;
+   unsigned weights_height;
+   unsigned weights_kernels;
+   unsigned weights_zero_point;
+   float weights_scale;
+
+   /* Convolution buffer banks assigned to input data and to weights */
+   unsigned input_banks;
+   unsigned weights_banks;
+
+   unsigned atomic_count;
+   unsigned surfaces_per_row;
+
+   /* Number of 64-bit entries in this task's register command stream. */
+   unsigned regcfg_amount;
+   /* Address of this task's register command stream.
+    * NOTE(review): assigned from a 64-bit address, so the upper 32 bits are
+    * dropped -- confirm that addresses always fit in 32 bits. */
+   uint32_t regcfg_addr;
+};
+
+/*
+ * A convolution as executed by the hardware, optionally with a tensor
+ * addition fused into it.
+ */
+struct rkt_operation {
+   /* Buffer holding the register command streams of all tasks. */
+   struct pipe_resource *regcmd;
+   /* Buffers returned by rkt_fill_weights() and rkt_fill_biases(). */
+   struct pipe_resource *weights;
+   struct pipe_resource *biases;
+
+   bool depthwise;
+   /* When set, all tasks are submitted in a single job so that the weights
+    * can be reused; otherwise every task gets a job of its own. */
+   bool reuse_weights_cbuf;
+   /* Written by rkt_fill_biases(). */
+   unsigned truncate_bits;
+   bool padding_same;
+   unsigned stride;
+
+   /* Set when the output of this operation is added by another operation. */
+   bool addition_input;
+   /* 0x80 minus the zero point of the added tensor. */
+   int addition_offset;
+   /* Scale of the added tensor. */
+   float addition_scale;
+
+   /* Input tensor: index, dimensions and quantization parameters */
+   unsigned input_index;
+   unsigned input_width;
+   unsigned input_height;
+   unsigned input_channels;
+   uint8_t input_zero_point;
+   float input_scale;
+
+   /* Output tensor: index, dimensions and quantization parameters */
+   unsigned output_index;
+   unsigned output_width;
+   unsigned output_height;
+   unsigned output_channels;
+   uint8_t output_zero_point;
+   float output_scale;
+
+   /* Weight tensor: dimensions and quantization parameters */
+   unsigned weights_width;
+   unsigned weights_height;
+   uint8_t weights_zero_point;
+   float weights_scale;
+
+   /* Index of the tensor added to the result, or -1 if there is none. */
+   int add_tensor;
+
+   struct util_dynarray tasks; /* struct split_task */
+};
+
+/*
+ * Driver-private subgraph. `base` must stay the first member because the
+ * driver converts pipe_ml_subgraph pointers with a plain cast.
+ */
+struct rkt_ml_subgraph {
+   struct pipe_ml_subgraph base;
+
+   struct util_dynarray operations; /* rkt_operation */
+   /* Indexed by tensor index; a slot is NULL until its buffer is created. */
+   struct util_dynarray tensors;    /* pipe_resource* */
+};
+
+/* Returns whether the driver can execute `operation`. */
+bool
+rkt_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation);
+
+/* Lowers and compiles `count` operations into a new subgraph. */
+struct pipe_ml_subgraph *
+rkt_ml_subgraph_create(struct pipe_context *pcontext,
+                       const struct pipe_ml_operation *poperations,
+                       unsigned count);
+
+/* Uploads the given inputs and submits the subgraph for execution. */
+void rkt_ml_subgraph_invoke(struct pipe_context *pcontext,
+                            struct pipe_ml_subgraph *psubgraph,
+                            unsigned inputs_count, unsigned input_idxs[],
+                            void *inputs[], bool is_signed[]);
+
+/* Copies the given output tensors into the caller's buffers. */
+void rkt_ml_subgraph_read_outputs(struct pipe_context *pcontext,
+                                  struct pipe_ml_subgraph *psubgraph,
+                                  unsigned outputs_count,
+                                  unsigned output_idxs[], void *outputs[],
+                                  bool is_signed[]);
+
+/* Releases the subgraph and everything it owns. */
+void rkt_ml_subgraph_destroy(struct pipe_context *context,
+                             struct pipe_ml_subgraph *psubgraph);
+
+/* Returns the resource in tensor slot `idx`, or NULL if the slot is empty. */
+struct rkt_resource *rkt_get_tensor(struct rkt_ml_subgraph *subgraph,
+                                    unsigned idx);
+
+/* Returns whether the convolution has to be run in depthwise mode. */
+bool rkt_is_depthwise(const struct pipe_ml_operation *poperation);
+
+/* Debug helper: writes `size` bytes at `ptr + offset` to a file named after
+ * `name` and the two numbers. */
+void rkt_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
+                     int suboperation_nr, int offset, unsigned size);
+
+#endif /* RKT_ML_H */
--- a/src/gallium/drivers/rocket/rkt_regcmd.c
+++ b/src/gallium/drivers/rocket/rkt_regcmd.c
@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_regcmd.h"
+#include "rkt_ml.h"
+#include "rkt_registers.h"
+
+/*
+ * Append one 64-bit command word to `regs`.
+ *
+ * The word is assembled as (target << 48) | (value << 16) | reg; none of the
+ * three inputs is masked before being merged.
+ */
+static void
+emit_raw(struct util_dynarray *regs, uint32_t target, uint32_t reg,
+         uint32_t value)
+{
+   const uint64_t target_field = (uint64_t)target << 48;
+   const uint64_t value_field = (uint64_t)value << 16;
+   const uint64_t word = target_field | value_field | (uint64_t)reg;
+
+   util_dynarray_append(regs, uint64_t, word);
+}
+
+/*
+ * Append a write of `value` to register `reg`. The target field is whatever
+ * rkt_get_target() returns for `reg`, plus 0x1.
+ */
+static void
+emit(struct util_dynarray *regs, uint32_t reg, uint32_t value)
+{
+   emit_raw(regs, rkt_get_target(reg) + 0x1, reg, value);
+}
+
+/*
+ * Shorthand for emit() on the `regs` array that is in scope at the call site.
+ *
+ * The expansion has no trailing semicolon, so `EMIT(...);` is exactly one
+ * statement and remains safe as the body of an unbraced if/else. Arguments
+ * are parenthesized so that any expression can be passed.
+ */
+#define EMIT(offset, value) emit(regs, (offset), (value))
+
+/*
+ * Append to `regs` the complete register command list for task `task_num`
+ * of `operation`.
+ *
+ * Every value is taken from the split_task stored at index `task_num` in
+ * operation->tasks, from `operation` itself, or from the tensors looked up
+ * in `subgraph`. The list ends with the PC words that enable the operation.
+ * The order of the writes is kept exactly as found; several registers are
+ * written twice.
+ */
+static void
+fill_first_regcmd(struct rkt_ml_subgraph *subgraph,
+                  const struct rkt_operation *operation,
+                  struct util_dynarray *regs, unsigned task_num)
+{
+   struct split_task *task =
+      util_dynarray_element(&operation->tasks, struct split_task, task_num);
+   unsigned num_tasks =
+      util_dynarray_num_elements(&operation->tasks, struct split_task);
+   unsigned output_zero_point = task->output_zero_point;
+   unsigned weights_zero_point = task->weights_zero_point;
+   /* Unsigned arithmetic: zero points below 0x80 wrap around. */
+   unsigned offset = output_zero_point - 0x80;
+
+   uint32_t con0 = CNA_CBUF_CON0_WEIGHT_BANK(task->weights_banks) |
+                   CNA_CBUF_CON0_DATA_BANK(task->input_banks);
+   /* Weight reuse is only requested for tasks after the first one. */
+   if (task_num > 0 && operation->reuse_weights_cbuf)
+      con0 |= CNA_CBUF_CON0_WEIGHT_REUSE(1);
+
+   EMIT(REG_CNA_CBUF_CON0, con0);
+
+   EMIT(REG_CNA_DCOMP_REGNUM, 0);
+   EMIT(REG_CNA_DCOMP_CTRL, 0);
+
+   uint32_t con1 = 0x0;
+   if (task->input_channels_real == 1) {
+      con1 |= CNA_CONV_CON1_NONALIGN_DMA(1) | CNA_CONV_CON1_GROUP_LINE_OFF(1) |
+              CNA_CONV_CON1_ARGB_IN(8);
+   }
+
+   if (operation->depthwise)
+      con1 |= CNA_CONV_CON1_CONV_MODE(3);
+
+   EMIT(REG_CNA_CONV_CON1, con1);
+
+   EMIT(REG_DPU_S_POINTER, DPU_S_POINTER_POINTER_PP_MODE(1) |
+                              DPU_S_POINTER_EXECUTER_PP_EN(1) |
+                              DPU_S_POINTER_POINTER_PP_EN(1));
+   EMIT(REG_DPU_RDMA_RDMA_S_POINTER,
+        DPU_RDMA_RDMA_S_POINTER_POINTER_PP_MODE(1) |
+           DPU_RDMA_RDMA_S_POINTER_EXECUTER_PP_EN(1) |
+           DPU_RDMA_RDMA_S_POINTER_POINTER_PP_EN(1));
+   EMIT(REG_CNA_CONV_CON1, con1);
+   EMIT(REG_CNA_CONV_CON2,
+        CNA_CONV_CON2_FEATURE_GRAINS(
+           50 + task->stride_y + 1)); /* Magic: Seems to pass the most tests */
+   EMIT(REG_CNA_CONV_CON3, CNA_CONV_CON3_CONV_X_STRIDE(task->stride_x) |
+                              CNA_CONV_CON3_CONV_Y_STRIDE(task->stride_y));
+   EMIT(REG_CNA_DATA_SIZE0,
+        CNA_DATA_SIZE0_DATAIN_WIDTH(task->input_width) |
+           CNA_DATA_SIZE0_DATAIN_HEIGHT(task->input_height));
+
+   EMIT(REG_CNA_DATA_SIZE1,
+        CNA_DATA_SIZE1_DATAIN_CHANNEL_REAL(task->input_channels_real - 1) |
+           CNA_DATA_SIZE1_DATAIN_CHANNEL(task->input_channels));
+
+   EMIT(REG_CNA_DATA_SIZE2, CNA_DATA_SIZE2_DATAOUT_WIDTH(task->output_width));
+   EMIT(REG_CNA_DATA_SIZE3, CNA_DATA_SIZE3_DATAOUT_ATOMICS(task->atomic_count));
+   EMIT(REG_CNA_WEIGHT_SIZE0, task->weights_width * task->weights_height *
+                                 task->input_channels * task->weights_kernels);
+   EMIT(REG_CNA_WEIGHT_SIZE1,
+        task->weights_width * task->weights_height * task->input_channels);
+   EMIT(REG_CNA_WEIGHT_SIZE2,
+        CNA_WEIGHT_SIZE2_WEIGHT_WIDTH(task->weights_width) |
+           CNA_WEIGHT_SIZE2_WEIGHT_HEIGHT(task->weights_height) |
+           CNA_WEIGHT_SIZE2_WEIGHT_KERNELS(task->weights_kernels));
+
+   EMIT(REG_CNA_CBUF_CON0, con0);
+
+   EMIT(REG_CNA_CBUF_CON1, CNA_CBUF_CON1_DATA_ENTRIES(task->input_data_entries));
+
+   /* Input conversion: fixed constants for single-channel input, bypass otherwise. */
+   if (task->input_channels_real == 1) {
+      unsigned truncate = 14;
+      unsigned scale = 16384;
+      /* This declaration shadows the function-level `offset` inside this branch. */
+      unsigned offset = 65408;
+
+      if (operation->addition_input || operation->add_tensor != -1) {
+         truncate = 15;
+         /* NOTE(review): 32388 is not 1 << 15 (32768) - confirm it is intended. */
+         scale = 32388;
+      }
+
+      EMIT(REG_CNA_CVT_CON0, CNA_CVT_CON0_CVT_TRUNCATE_3(truncate) |
+                                CNA_CVT_CON0_CVT_TRUNCATE_2(truncate) |
+                                CNA_CVT_CON0_CVT_TRUNCATE_1(truncate) |
+                                CNA_CVT_CON0_CVT_TRUNCATE_0(truncate));
+      EMIT(REG_CNA_CVT_CON1,
+           CNA_CVT_CON1_CVT_SCALE0(scale) | CNA_CVT_CON1_CVT_OFFSET0(offset));
+      EMIT(REG_CNA_CVT_CON2,
+           CNA_CVT_CON2_CVT_SCALE1(scale) | CNA_CVT_CON2_CVT_OFFSET1(offset));
+      EMIT(REG_CNA_CVT_CON3,
+           CNA_CVT_CON3_CVT_SCALE2(scale) | CNA_CVT_CON3_CVT_OFFSET2(offset));
+      EMIT(REG_CNA_CVT_CON4,
+           CNA_CVT_CON4_CVT_SCALE3(scale) | CNA_CVT_CON4_CVT_OFFSET3(offset));
+   } else {
+      EMIT(REG_CNA_CVT_CON0, CNA_CVT_CON0_DATA_SIGN(1) |
+                                CNA_CVT_CON0_CVT_TYPE(1) |
+                                CNA_CVT_CON0_CVT_BYPASS(1));
+      EMIT(REG_CNA_CVT_CON1, CNA_CVT_CON1_CVT_SCALE0(1));
+      EMIT(REG_CNA_CVT_CON2, CNA_CVT_CON2_CVT_SCALE1(1));
+      EMIT(REG_CNA_CVT_CON3, CNA_CVT_CON3_CVT_SCALE2(1));
+      EMIT(REG_CNA_CVT_CON4, CNA_CVT_CON4_CVT_SCALE3(1));
+   }
+
+   EMIT(REG_CNA_FC_CON0, 0);
+   EMIT(REG_CNA_FC_CON1, 0);
+   EMIT(REG_CNA_PAD_CON0, CNA_PAD_CON0_PAD_LEFT(task->pad_left) |
+                             CNA_PAD_CON0_PAD_TOP(task->pad_top));
+   EMIT(REG_CNA_FEATURE_DATA_ADDR,
+        rkt_get_tensor(subgraph, operation->input_index)->phys_addr +
+           task->input_offset);
+   EMIT(REG_CNA_FC_CON2, 0);
+   EMIT(REG_CNA_DMA_CON0,
+        CNA_DMA_CON0_WEIGHT_BURST_LEN(15) | CNA_DMA_CON0_DATA_BURST_LEN(15));
+   EMIT(REG_CNA_DMA_CON1, CNA_DMA_CON1_LINE_STRIDE(task->input_line_stride));
+   EMIT(REG_CNA_DMA_CON2, CNA_DMA_CON2_SURF_STRIDE(task->input_surface_stride));
+
+   /* The DMA width comes from the operation, the DMA height from the task. */
+   EMIT(REG_CNA_FC_DATA_SIZE0,
+        CNA_FC_DATA_SIZE0_DMA_WIDTH(operation->input_width) |
+           CNA_FC_DATA_SIZE0_DMA_HEIGHT(task->input_height));
+
+   EMIT(REG_CNA_FC_DATA_SIZE1,
+        CNA_FC_DATA_SIZE1_DMA_CHANNEL(task->input_channels));
+   EMIT(REG_CNA_DCOMP_CTRL, 0);
+   EMIT(REG_CNA_DCOMP_REGNUM, 0);
+   EMIT(REG_CNA_DCOMP_ADDR0, rkt_resource(operation->weights)->phys_addr);
+   EMIT(REG_CNA_DCOMP_AMOUNT0, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT1, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT2, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT3, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT4, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT5, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT6, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT7, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT8, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT9, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT10, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT11, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT12, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT13, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT14, 0);
+   EMIT(REG_CNA_DCOMP_AMOUNT15, 0);
+
+   if (task->input_channels_real == 1) {
+      EMIT(REG_CNA_CVT_CON5, 65535);
+   } else {
+      EMIT(REG_CNA_CVT_CON5, 0);
+   }
+
+   /*
+    * Padding value register. The later assignments override the earlier
+    * ones, so the depthwise special case has the last word.
+    */
+   int32_t pad_con1;
+   if (task->weights_width >= 3 && task->input_zero_point == 0x0)
+      pad_con1 = 0xffff8080;
+   else
+      pad_con1 = task->input_zero_point - 0x80;
+
+   if (operation->addition_input || operation->add_tensor != -1)
+      pad_con1 = 0xffffff80;
+
+   if (operation->depthwise && task->input_zero_point == 0x8b)
+      pad_con1 = 0x0b0b;
+
+   EMIT(REG_CNA_PAD_CON1, pad_con1);
+
+   uint32_t misc_cfg = CORE_MISC_CFG_QD_EN(1);
+   if (operation->depthwise)
+      misc_cfg |= CORE_MISC_CFG_DW_EN(1);
+
+   EMIT(REG_CORE_MISC_CFG, misc_cfg);
+   EMIT(REG_CORE_DATAOUT_SIZE_0,
+        CORE_DATAOUT_SIZE_0_DATAOUT_HEIGHT(task->output_height - 1) |
+           CORE_DATAOUT_SIZE_0_DATAOUT_WIDTH(task->output_width - 1));
+   EMIT(REG_CORE_DATAOUT_SIZE_1,
+        CORE_DATAOUT_SIZE_1_DATAOUT_CHANNEL(task->output_channels - 1));
+   EMIT(REG_CORE_CLIP_TRUNCATE,
+        CORE_CLIP_TRUNCATE_CLIP_TRUNCATE(operation->truncate_bits));
+   /* Raw write to offset 0x3030, for which no REG_* name is used here. */
+   emit_raw(regs, CORE | 0x1, 0x3030, 0);
+
+   uint32_t feat_mode_cfg =
+      DPU_FEATURE_MODE_CFG_BURST_LEN(15) | DPU_FEATURE_MODE_CFG_OUTPUT_MODE(2);
+   if (operation->depthwise)
+      feat_mode_cfg |= DPU_FEATURE_MODE_CFG_CONV_MODE(3);
+
+   EMIT(REG_DPU_FEATURE_MODE_CFG, feat_mode_cfg);
+   EMIT(REG_DPU_DATA_FORMAT, 0);
+   EMIT(REG_DPU_OFFSET_PEND, 0);
+   EMIT(REG_DPU_DST_BASE_ADDR,
+        rkt_get_tensor(subgraph, operation->output_index)->phys_addr +
+           task->output_offset);
+   EMIT(REG_DPU_DST_SURF_STRIDE,
+        DPU_DST_SURF_STRIDE_DST_SURF_STRIDE(task->output_surface_stride));
+   EMIT(REG_DPU_DATA_CUBE_WIDTH,
+        DPU_DATA_CUBE_WIDTH_WIDTH(task->output_width - 1));
+   EMIT(REG_DPU_DATA_CUBE_HEIGHT,
+        DPU_DATA_CUBE_HEIGHT_HEIGHT(task->output_height - 1));
+   EMIT(REG_DPU_DATA_CUBE_NOTCH_ADDR, 0);
+   EMIT(REG_DPU_DATA_CUBE_CHANNEL,
+        DPU_DATA_CUBE_CHANNEL_ORIG_CHANNEL(task->output_channels_real - 1) |
+           DPU_DATA_CUBE_CHANNEL_CHANNEL(task->output_channels - 1));
+   EMIT(REG_DPU_BS_CFG, DPU_BS_CFG_BS_ALU_ALGO(2) | DPU_BS_CFG_BS_ALU_SRC(1) |
+                           DPU_BS_CFG_BS_RELU_BYPASS(1) |
+                           DPU_BS_CFG_BS_MUL_BYPASS(1));
+   EMIT(REG_DPU_BS_ALU_CFG, 0);
+   EMIT(REG_DPU_BS_MUL_CFG, 0);
+   EMIT(REG_DPU_BS_RELUX_CMP_VALUE, 0);
+
+   if (operation->depthwise) {
+      EMIT(REG_DPU_BS_OW_CFG, DPU_BS_OW_CFG_SIZE_E_2(3) |
+                                 DPU_BS_OW_CFG_SIZE_E_1(3) |
+                                 DPU_BS_OW_CFG_SIZE_E_0(3));
+   } else {
+      EMIT(REG_DPU_BS_OW_CFG, DPU_BS_OW_CFG_SIZE_E_2(1) |
+                                 DPU_BS_OW_CFG_SIZE_E_1(1) |
+                                 DPU_BS_OW_CFG_SIZE_E_0(1));
+   }
+
+   EMIT(REG_DPU_BS_OW_OP, DPU_BS_OW_OP_OW_OP(0x80 - weights_zero_point));
+
+   EMIT(REG_DPU_WDMA_SIZE_0,
+        DPU_WDMA_SIZE_0_CHANNEL_WDMA(task->output_channels - 1));
+   EMIT(REG_DPU_WDMA_SIZE_1,
+        DPU_WDMA_SIZE_1_HEIGHT_WDMA(task->output_height - 1) |
+           DPU_WDMA_SIZE_1_WIDTH_WDMA(task->output_width - 1));
+   EMIT(REG_DPU_BN_CFG,
+        DPU_BN_CFG_BN_RELU_BYPASS(1) | DPU_BN_CFG_BN_MUL_BYPASS(1) |
+           DPU_BN_CFG_BN_ALU_BYPASS(1) | DPU_BN_CFG_BN_BYPASS(1));
+   EMIT(REG_DPU_BN_ALU_CFG, 0);
+   EMIT(REG_DPU_BN_MUL_CFG, 0);
+   EMIT(REG_DPU_BN_RELUX_CMP_VALUE, 0);
+
+   /*
+    * Element-wise stage and output conversion. With an addition tensor the
+    * constants come from hard-coded tables keyed on operation->addition_scale;
+    * without one the stage is bypassed and the output scale and shift are
+    * derived from the input, weights and output scales.
+    */
+   if (operation->add_tensor != -1) {
+      EMIT(REG_DPU_EW_CFG,
+           DPU_EW_CFG_EW_CVT_TYPE(1) | DPU_EW_CFG_EW_DATA_MODE(1) |
+              DPU_EW_CFG_EDATA_SIZE(1) | DPU_EW_CFG_EW_ALU_ALGO(2) |
+              DPU_EW_CFG_EW_RELU_BYPASS(1) | DPU_EW_CFG_EW_LUT_BYPASS(1) |
+              DPU_EW_CFG_EW_OP_SRC(1));
+
+      /* See http://nvdla.org/hw/v1/ias/precision.html#element-wise */
+      EMIT(REG_DPU_EW_CVT_OFFSET_VALUE, operation->addition_offset);
+
+      /*
+       * Lookup of add_scale by addition_scale with a 0.00001 tolerance. A
+       * scale that matches no entry leaves add_scale at 0.0.
+       */
+      float add_scale = 0.0;
+      if (fabs(operation->addition_scale - 0.090192) < 0.00001) {
+         add_scale = 299.671889248;
+      } else if (fabs(operation->addition_scale - 0.399250) < 0.00001) {
+         add_scale = 1326.499209406;
+      } else if (fabs(operation->addition_scale - 0.364902) < 0.00001) {
+         add_scale = 780.34375;
+      } else if (fabs(operation->addition_scale - 0.422037) < 0.00001) {
+         add_scale = 715.5625;
+      } else if (fabs(operation->addition_scale - 0.213016) < 0.00001) {
+         add_scale = 564.6875;
+      } else if (fabs(operation->addition_scale - 0.244231) < 0.00001) {
+         add_scale = 499.796875;
+      } else if (fabs(operation->addition_scale - 0.283416) < 0.00001) {
+         add_scale = 488.203125;
+      } else if (fabs(operation->addition_scale - 0.171151) < 0.00001) {
+         add_scale = 602.90625;
+      } else if (fabs(operation->addition_scale - 0.164588) < 0.00001) {
+         add_scale = 271.921875;
+      } else if (fabs(operation->addition_scale - 0.204098) < 0.00001) {
+         add_scale = 262.90625;
+      } else if (fabs(operation->addition_scale - 0.116532) < 0.00001) {
+         add_scale = 450.140625;
+      } else if (fabs(operation->addition_scale - 0.134499) < 0.00001) {
+         add_scale = 212.1953125;
+      } else if (fabs(operation->addition_scale - 0.220141) < 0.00001) {
+         add_scale = 368.28125;
+      } else if (fabs(operation->addition_scale - 0.094560) < 0.00001) {
+         add_scale = 416.421875;
+      } else if (fabs(operation->addition_scale - 0.093230) < 0.00001) {
+         add_scale = 305.421875;
+      } else if (fabs(operation->addition_scale - 0.100618) < 0.00001) {
+         add_scale = 313.671875;
+      } else {
+         add_scale = 0.0;
+      }
+
+      uint32_t add_scale_bits = fui(add_scale);
+      /* Taken from
+       * https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130
+       */
+      unsigned add_shift = 127 + 31 - 32 - (add_scale_bits >> 23) + 16;
+
+      /* 15 bits taken from bit 9 upwards; bit 14 is forced on when clear. */
+      unsigned scale = ((add_scale_bits >> 9) & 0x7fff);
+      if (scale < 1 << 14)
+         scale |= 1 << 14;
+
+      EMIT(REG_DPU_EW_CVT_SCALE_VALUE,
+           DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SHIFT(add_shift - 1) |
+              DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SCALE(scale));
+
+      EMIT(REG_DPU_EW_RELUX_CMP_VALUE, 0x0);
+
+      /*
+       * Second table, also keyed on addition_scale: output offset, scale and
+       * shift. The final else supplies the values for every other scale.
+       */
+      if (fabs(operation->addition_scale - 0.213016) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x4);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(25914));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else if (fabs(operation->addition_scale - 0.244231) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(28927));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else if (fabs(operation->addition_scale - 0.283416) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x6);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(26050));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else if (fabs(operation->addition_scale - 0.171151) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffffd);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(28937));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else if (fabs(operation->addition_scale - 0.164588) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(24877));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+      } else if (fabs(operation->addition_scale - 0.204098) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x0);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(23272));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+      } else if (fabs(operation->addition_scale - 0.116532) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffff8);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(32292));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else if (fabs(operation->addition_scale - 0.134499) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0xfffffffb);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(24153));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+      } else if (fabs(operation->addition_scale - 0.220141) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0xb);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(27655));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else if (fabs(operation->addition_scale - 0.094560) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x5);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(20432));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+      } else if (fabs(operation->addition_scale - 0.093230) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0xffffffff);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(25449));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+      } else if (fabs(operation->addition_scale - 0.100618) < 0.00001) {
+         /* The only entry that uses the computed function-level `offset`. */
+         EMIT(REG_DPU_OUT_CVT_OFFSET, offset);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(16874));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(23));
+      } else if (fabs(operation->addition_scale - 0.422037) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x1);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(22559));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else if (fabs(operation->addition_scale - 0.364902) < 0.00001) {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x4);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(18589));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(24));
+      } else {
+         EMIT(REG_DPU_OUT_CVT_OFFSET, 0x6);
+         EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(27676));
+         EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(25));
+      }
+   } else {
+      EMIT(REG_DPU_EW_CFG,
+           DPU_EW_CFG_EW_RELU_BYPASS(1) | DPU_EW_CFG_EW_OP_CVT_BYPASS(1) |
+              DPU_EW_CFG_EW_LUT_BYPASS(1) | DPU_EW_CFG_EW_OP_BYPASS(1) |
+              DPU_EW_CFG_EW_BYPASS(1));
+      EMIT(REG_DPU_EW_CVT_OFFSET_VALUE, 0);
+      EMIT(REG_DPU_EW_CVT_SCALE_VALUE, DPU_EW_CVT_SCALE_VALUE_EW_OP_CVT_SCALE(1));
+      EMIT(REG_DPU_EW_RELUX_CMP_VALUE, 0);
+      EMIT(REG_DPU_OUT_CVT_OFFSET, offset);
+
+      float conv_scale =
+         (task->input_scale * task->weights_scale) / task->output_scale;
+      // DBG("conv_scale %f\n", conv_scale);
+      uint32_t scale_bits = fui(conv_scale);
+      /* Taken from
+       * https://github.com/pytorch/QNNPACK/blob/master/src/qnnpack/requantization.h#L130
+       */
+      unsigned shift = 127 + 31 - 32 - (scale_bits >> 23) + 16;
+
+      if (operation->truncate_bits > 0)
+         shift--;
+
+      /* Unlike the addition path above, 1 is added to the extracted bits. */
+      unsigned scale = ((scale_bits >> 9) & 0x7fff) + 1;
+      if (scale < 1 << 14)
+         scale |= 1 << 14;
+
+      EMIT(REG_DPU_OUT_CVT_SCALE, DPU_OUT_CVT_SCALE_OUT_CVT_SCALE(scale));
+      EMIT(REG_DPU_OUT_CVT_SHIFT, DPU_OUT_CVT_SHIFT_OUT_CVT_SHIFT(shift - 1));
+   }
+
+   EMIT(REG_DPU_EW_OP_VALUE_0, 0);
+   EMIT(REG_DPU_EW_OP_VALUE_1, 0);
+   EMIT(REG_DPU_EW_OP_VALUE_2, 0);
+   EMIT(REG_DPU_EW_OP_VALUE_3, 0);
+   EMIT(REG_DPU_EW_OP_VALUE_4, 0);
+   EMIT(REG_DPU_EW_OP_VALUE_5, 0);
+   EMIT(REG_DPU_EW_OP_VALUE_6, 0);
+   EMIT(REG_DPU_EW_OP_VALUE_7, 0);
+   EMIT(REG_DPU_SURFACE_ADD, DPU_SURFACE_ADD_SURF_ADD(task->surfaces_per_row));
+   /* Raw write to offset 0x40c4, for which no REG_* name is used here. */
+   emit_raw(regs, DPU | 0x1, 0x40c4, 0);
+   EMIT(REG_DPU_LUT_ACCESS_CFG, 0);
+   EMIT(REG_DPU_LUT_ACCESS_DATA, 0);
+   EMIT(REG_DPU_LUT_CFG, 0);
+   EMIT(REG_DPU_LUT_INFO, 0);
+   EMIT(REG_DPU_LUT_LE_START, 0);
+   EMIT(REG_DPU_LUT_LE_END, 0);
+   EMIT(REG_DPU_LUT_LO_START, 0);
+   EMIT(REG_DPU_LUT_LO_END, 0);
+   EMIT(REG_DPU_LUT_LE_SLOPE_SCALE, 0);
+   EMIT(REG_DPU_LUT_LE_SLOPE_SHIFT, 0);
+   EMIT(REG_DPU_LUT_LO_SLOPE_SCALE, 0);
+   EMIT(REG_DPU_LUT_LO_SLOPE_SHIFT, 0);
+   EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_WIDTH,
+        DPU_RDMA_RDMA_DATA_CUBE_WIDTH_WIDTH(task->output_width - 1));
+   EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_HEIGHT,
+        DPU_RDMA_RDMA_DATA_CUBE_HEIGHT_HEIGHT(task->output_height - 1));
+   EMIT(REG_DPU_RDMA_RDMA_DATA_CUBE_CHANNEL,
+        DPU_RDMA_RDMA_DATA_CUBE_CHANNEL_CHANNEL(task->output_channels - 1));
+
+   if (operation->add_tensor != -1) {
+      EMIT(REG_DPU_RDMA_RDMA_SRC_BASE_ADDR,
+           rkt_get_tensor(subgraph, operation->add_tensor)->phys_addr +
+              task->output_offset);
+   } else {
+      EMIT(REG_DPU_RDMA_RDMA_SRC_BASE_ADDR, 0);
+   }
+
+   EMIT(REG_DPU_RDMA_RDMA_BRDMA_CFG, DPU_RDMA_RDMA_BRDMA_CFG_BRDMA_DATA_USE(1));
+   EMIT(REG_DPU_RDMA_RDMA_BS_BASE_ADDR,
+        rkt_resource(operation->biases)->phys_addr);
+   EMIT(REG_DPU_RDMA_RDMA_NRDMA_CFG, 0);
+   EMIT(REG_DPU_RDMA_RDMA_BN_BASE_ADDR, 0);
+
+   /* Computed from the whole operation's output size, with a floor of 12. */
+   unsigned ew_stride =
+      MAX2(operation->output_width * operation->output_height, 12);
+
+   if (operation->add_tensor != -1) {
+      EMIT(REG_DPU_RDMA_RDMA_ERDMA_CFG,
+           DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DATA_MODE(1) |
+              DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DATA_SIZE(1));
+      unsigned ew_base_offset =
+         operation->output_width * operation->output_height * ATOMIC_K_SIZE;
+      EMIT(REG_DPU_RDMA_RDMA_EW_BASE_ADDR,
+           rkt_get_tensor(subgraph, operation->add_tensor)->phys_addr +
+              task->output_offset + ew_base_offset);
+      EMIT(REG_DPU_RDMA_RDMA_EW_SURF_STRIDE,
+           DPU_RDMA_RDMA_EW_SURF_STRIDE_EW_SURF_STRIDE(ew_stride));
+   } else {
+      EMIT(REG_DPU_RDMA_RDMA_ERDMA_CFG, DPU_RDMA_RDMA_ERDMA_CFG_ERDMA_DISABLE(1));
+      EMIT(REG_DPU_RDMA_RDMA_EW_BASE_ADDR, 0);
+      EMIT(REG_DPU_RDMA_RDMA_EW_SURF_STRIDE, 0);
+   }
+
+   uint32_t rdma_feat_mode_cfg = 0x0;
+
+   if (operation->add_tensor != -1) {
+      rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_BURST_LEN(15) |
+                            DPU_RDMA_RDMA_FEATURE_MODE_CFG_COMB_USE(5);
+   } else {
+      rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_BURST_LEN(15) |
+                            DPU_RDMA_RDMA_FEATURE_MODE_CFG_MRDMA_DISABLE(1);
+   }
+
+   if (operation->depthwise)
+      rdma_feat_mode_cfg |= DPU_RDMA_RDMA_FEATURE_MODE_CFG_CONV_MODE(3);
+
+   EMIT(REG_DPU_RDMA_RDMA_FEATURE_MODE_CFG, rdma_feat_mode_cfg);
+   EMIT(REG_DPU_RDMA_RDMA_SRC_DMA_CFG, 0);
+
+   /* Grows by the rows of the operation output that this task does not cover. */
+   unsigned surf_notch =
+      ew_stride +
+      task->output_width * (operation->output_height - task->output_height);
+
+   /* Hard-coded override for operations whose input width is 3. */
+   if (operation->input_width == 3) {
+      surf_notch = 15;
+   }
+
+   if (operation->add_tensor != -1) {
+      EMIT(REG_DPU_RDMA_RDMA_SURF_NOTCH,
+           DPU_RDMA_RDMA_SURF_NOTCH_SURF_NOTCH_ADDR(surf_notch));
+   } else {
+      EMIT(REG_DPU_RDMA_RDMA_SURF_NOTCH, 0);
+   }
+
+   EMIT(REG_DPU_RDMA_RDMA_PAD_CFG, 0);
+   EMIT(REG_DPU_RDMA_RDMA_WEIGHT,
+        DPU_RDMA_RDMA_WEIGHT_E_WEIGHT(1) | DPU_RDMA_RDMA_WEIGHT_N_WEIGHT(1) |
+           DPU_RDMA_RDMA_WEIGHT_B_WEIGHT(1) | DPU_RDMA_RDMA_WEIGHT_M_WEIGHT(1));
+
+   if (operation->add_tensor != -1) {
+      EMIT(REG_DPU_RDMA_RDMA_EW_SURF_NOTCH,
+           DPU_RDMA_RDMA_EW_SURF_NOTCH_EW_SURF_NOTCH(surf_notch));
+   } else {
+      EMIT(REG_DPU_RDMA_RDMA_EW_SURF_NOTCH, 0x0);
+   }
+
+   /*
+    * A single-task operation gets an all-zero word in this slot; otherwise
+    * REG_PC_BASE_ADDRESS is written with 0.
+    */
+   if (num_tasks == 1)
+      util_dynarray_append(regs, uint64_t, 0x0);
+   else
+      EMIT(REG_PC_BASE_ADDRESS, 0);
+
+   EMIT(REG_PC_REGISTER_AMOUNTS, 0);
+
+   /* TRM: before op_en, 64'h0041_xxxx_xxxx_xxxx must be set. */
+   util_dynarray_append(regs, uint64_t, 0x0041000000000000);
+
+   /* TRM: 64'h0081_0000_007f_0008 will set each block's op_en(CNA, CORE, ...,
+    * PPU_RDMA). */
+   emit_raw(regs, 0x81, REG_PC_OPERATION_ENABLE,
+            PC_OPERATION_ENABLE_RESERVED_0(14) | PC_OPERATION_ENABLE_OP_EN(1));
+}
+
+/*
+ * Public entry point: append the register commands for task `task_num` of
+ * `operation` to `regs`.
+ */
+void
+rkt_fill_regcmd(struct rkt_ml_subgraph *subgraph,
+                const struct rkt_operation *operation,
+                struct util_dynarray *regs, unsigned task_num)
+{
+   /*
+    * TODO: Only the regcmd of the first task in an operation should need
+    * the complete register set; until that is confirmed, every task gets
+    * all of it.
+    */
+   fill_first_regcmd(subgraph, operation, regs, task_num);
+}
--- a/src/gallium/drivers/rocket/rkt_regcmd.h
+++ b/src/gallium/drivers/rocket/rkt_regcmd.h
@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_REGCMD_H
+#define RKT_REGCMD_H
+
+#include "rkt_ml.h"
+
+/*
+ * Appends the register commands for task `task_num` of `operation` to `regs`.
+ * Tensor addresses are looked up through `subgraph`.
+ */
+void rkt_fill_regcmd(struct rkt_ml_subgraph *subgraph,
+                     const struct rkt_operation *operation,
+                     struct util_dynarray *regs, unsigned task_num);
+
+#endif /* RKT_REGCMD_H */
--- a/src/gallium/drivers/rocket/rkt_task.c
+++ b/src/gallium/drivers/rocket/rkt_task.c
@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "rkt_task.h"
+#include "rkt_ml.h"
+
+/*
+ * Number of CBUF entries taken by one input row (slice) of `operation`.
+ *
+ * The channels of a pixel are counted in atomics of FEATURE_ATOMIC_SIZE
+ * bytes. Atomics that fill whole entries and the leftover atomics are
+ * accounted for separately and then summed.
+ */
+static unsigned
+calc_entries_per_slice(struct rkt_operation *operation)
+{
+   const unsigned bytes_per_element = sizeof(uint8_t);
+   const unsigned entry_atomics = CBUF_ENTRY_SIZE / FEATURE_ATOMIC_SIZE;
+   const unsigned channel_atomics = DIV_ROUND_UP(
+      operation->input_channels * bytes_per_element, FEATURE_ATOMIC_SIZE);
+   const unsigned leftover_atomics = channel_atomics % entry_atomics;
+
+   /* Entries fully occupied by channel atomics. */
+   unsigned entries =
+      (channel_atomics / entry_atomics) * operation->input_width;
+
+   /* A leftover of exactly 3 atomics costs one whole entry per pixel. */
+   if (leftover_atomics == 3)
+      entries += operation->input_width;
+   else
+      entries += DIV_ROUND_UP(leftover_atomics * operation->input_width,
+                              entry_atomics);
+
+   return entries;
+}
+
+/*
+ * Number of CBUF banks needed to hold every input row of `operation`,
+ * rounded up to whole banks.
+ */
+static unsigned
+calc_input_banks(struct rkt_operation *operation)
+{
+   unsigned total_entries =
+      calc_entries_per_slice(operation) * operation->input_height;
+
+   return DIV_ROUND_UP(total_entries, CBUF_ENTRIES_PER_BANK);
+}
+
+/*
+ * Number of CBUF banks reserved for the weights of `operation`.
+ *
+ * The weight size is width * height * input channels, multiplied by the
+ * number of output channels unless the operation is depthwise.
+ */
+static unsigned
+calc_weights_banks(struct rkt_operation *operation)
+{
+   const unsigned bytes_per_element = sizeof(uint8_t);
+   unsigned weight_bytes = operation->weights_width *
+                           operation->weights_height *
+                           operation->input_channels * bytes_per_element;
+
+   if (!operation->depthwise)
+      weight_bytes *= operation->output_channels;
+
+   unsigned weight_entries = DIV_ROUND_UP(weight_bytes, CBUF_ENTRY_SIZE);
+   unsigned weight_banks = DIV_ROUND_UP(weight_entries, CBUF_ENTRIES_PER_BANK);
+
+   /*
+    * One more bank than computed. It is unclear why it is needed; the
+    * calculation above may not match this hardware.
+    */
+   return weight_banks + 1;
+}
+
+/* Size of one line of `width` pixels: ATOMIC_K_SIZE one-byte elements each. */
+static unsigned
+calc_line_stride(unsigned width)
+{
+   const unsigned bytes_per_element = sizeof(uint8_t);
+
+   return width * ATOMIC_K_SIZE * bytes_per_element;
+}
+
+/*
+ * Convert the implicit SAME padding of `operation` into explicit per-edge
+ * padding amounts.
+ *
+ * All four outputs are set to 0 unless the operation uses SAME padding and
+ * its kernel is wider than one element. Otherwise the total padding along
+ * an axis is
+ *
+ *    max((output - 1) * stride + kernel - input, 0)
+ *
+ * and is split so that the leading edge (left/top) gets the smaller half.
+ * Horizontal padding (left/right) comes from the width, vertical padding
+ * (top/bottom) from the height.
+ *
+ * The totals are computed as signed values: the operands are unsigned, so
+ * an unsigned subtraction would wrap instead of going negative and the
+ * clamp to 0 would never trigger.
+ */
+static void
+calc_explicit_padding(const struct rkt_operation *operation,
+                      unsigned *pad_top, unsigned *pad_bottom,
+                      unsigned *pad_left, unsigned *pad_right)
+{
+   *pad_left = 0;
+   *pad_right = 0;
+   *pad_top = 0;
+   *pad_bottom = 0;
+
+   if (!operation->padding_same || operation->weights_width <= 1)
+      return;
+
+   /* Convert from implicit to explicit padding */
+   int pad_along_width =
+      MAX2((int)((operation->output_width - 1) * operation->stride +
+                 operation->weights_width) -
+              (int)operation->input_width,
+           0);
+   int pad_along_height =
+      MAX2((int)((operation->output_height - 1) * operation->stride +
+                 operation->weights_height) -
+              (int)operation->input_height,
+           0);
+
+   *pad_left = pad_along_width / 2;
+   *pad_right = pad_along_width - *pad_left;
+   *pad_top = pad_along_height / 2;
+   *pad_bottom = pad_along_height - *pad_top;
+}
+
+/*
+ * Fill in the fields of `task` that depend only on `operation`: strides,
+ * input/output/weights geometry, quantization parameters, DMA strides, CBUF
+ * data entries and surfaces per row.
+ *
+ * `subgraph` is not referenced in the body. Bank counts, slices, padding and
+ * offsets are not set here.
+ */
+static void
+fill_task(struct rkt_ml_subgraph *subgraph,
+          struct rkt_operation *operation,
+          struct split_task *task)
+{
+   /* The same stride is used on both axes. */
+   task->stride_x = operation->stride;
+   task->stride_y = operation->stride;
+
+   /* Special case: an input width of 8 is doubled when an addition is involved. */
+   task->input_width = operation->input_width;
+   if (task->input_width == 8 &&
+       (operation->addition_input || operation->add_tensor != -1))
+      task->input_width *= 2;
+
+   task->input_height = operation->input_height;
+   /* Channel count rounded up to a multiple of FEATURE_ATOMIC_SIZE, at least one. */
+   task->input_channels =
+      ALIGN(MAX2(operation->input_channels, FEATURE_ATOMIC_SIZE),
+            FEATURE_ATOMIC_SIZE);
+   task->input_channels_real = operation->input_channels;
+   task->input_zero_point = operation->input_zero_point;
+   task->input_scale = operation->input_scale;
+
+   task->output_width = operation->output_width;
+   task->output_height = operation->output_height;
+
+   /*
+    * Output channels are rounded up to a multiple of 32. For depthwise
+    * operations they are doubled when there are at most 32 real channels,
+    * then rounded up to a multiple of 64.
+    */
+   task->output_channels_real = operation->output_channels;
+   task->output_channels = ALIGN(MAX2(operation->output_channels, 32), 32);
+   if (operation->depthwise) {
+      if (task->output_channels_real <= 32)
+         task->output_channels *= 2;
+      task->output_channels = ALIGN(task->output_channels, 64);
+   }
+
+   task->output_zero_point = operation->output_zero_point;
+   task->output_scale = operation->output_scale;
+
+   /*
+    * Input strides. Single-channel input that feeds several output channels
+    * or an addition uses its own layout; everything else uses a quarter of
+    * the line stride. The surface strides are computed in float and then
+    * stored into the task field.
+    */
+   if (task->input_channels_real == 1 &&
+       (task->output_channels_real > 1 ||
+        (operation->addition_input || operation->add_tensor != -1))) {
+      task->input_width = MAX2(task->input_width, FEATURE_ATOMIC_SIZE);
+      task->input_line_stride =
+         MAX2(calc_line_stride(operation->input_width) / FEATURE_ATOMIC_SIZE,
+              FEATURE_ATOMIC_SIZE);
+
+      /*
+       * NOTE(review): this branch is only entered when input_channels_real
+       * (copied from operation->input_channels) is 1, so the test for 32
+       * input channels looks unreachable here - confirm.
+       */
+      if (operation->input_channels == 32 && operation->input_width == 80) {
+         task->input_line_stride *= 4;
+         task->input_surface_stride = (float)task->input_line_stride *
+                                      (((float)task->input_height / 4) - 1);
+      } else
+         task->input_surface_stride =
+            (float)task->input_line_stride * (((float)task->input_height) - 1);
+   } else {
+      task->input_line_stride = calc_line_stride(operation->input_width) / 4;
+      task->input_surface_stride =
+         (float)task->input_line_stride * (((float)task->input_height / 4) - 1);
+   }
+
+   /*
+    * NOTE(review): this tests task->input_width after the doubling at the
+    * top of the function, so an operation width of 8 with an addition has
+    * already become 16 by now. Presumably operation->input_width was meant -
+    * confirm.
+    */
+   if (task->input_width == 8 &&
+       (operation->addition_input || operation->add_tensor != -1)) {
+      task->input_line_stride /= 2;
+      task->input_surface_stride = 112;
+   }
+
+   /* Output surface stride: bytes of the task's output rows / FEATURE_ATOMIC_SIZE. */
+   int output_line_stride = calc_line_stride(operation->output_width);
+   task->output_surface_stride = output_line_stride * task->output_height;
+   task->output_surface_stride /= FEATURE_ATOMIC_SIZE;
+
+   /* CBUF data entries, with a hard-coded value for the 40x40-channel case. */
+   if (task->input_channels_real == 1)
+      task->input_data_entries = task->input_width * task->input_height;
+   else if (task->input_width == 40 && task->input_channels_real == 40)
+      task->input_data_entries = 40;
+   else
+      task->input_data_entries = DIV_ROUND_UP(
+         task->input_width * 2 *
+            DIV_ROUND_UP(task->input_channels_real, FEATURE_ATOMIC_SIZE),
+         8);
+
+   task->weights_width = operation->weights_width;
+   task->weights_height = operation->weights_height;
+   task->weights_zero_point = operation->weights_zero_point;
+   task->weights_scale = operation->weights_scale;
+
+   /* Depthwise uses a single kernel; otherwise the output channels rounded up to even. */
+   if (operation->depthwise)
+      task->weights_kernels = 1;
+   else
+      task->weights_kernels = ALIGN(operation->output_channels, 2);
+
+   task->surfaces_per_row = task->output_width * task->output_height * 2;
+   if (operation->depthwise)
+      task->surfaces_per_row *= 2;
+}
+
+void
+rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
+                struct rkt_operation *operation)
+{
+   /* Function mostly taken from NVDLA.
+    *
+    * Fills operation->tasks with the split_task entries for this operation;
+    * `subgraph` is only passed on to fill_task(). A "slice" below is one row
+    * of the input: slice indices are compared against input_height and a
+    * task's input_height is its bottom_slice - top_slice + 1.
+    *
+    * 1. Divide the CBUF_BANKS banks between weights and input and set
+    *    operation->reuse_weights_cbuf accordingly.
+    * 2. If the bank count from calc_input_banks() fits in the input share,
+    *    append a single task spanning the whole input and return.
+    * 3. Otherwise append chunks of up to `available_slices` slices. Each new
+    *    chunk's top_slice is derived from the previous chunk's bottom_slice,
+    *    the kernel height and the stride, so neighbouring chunks may share
+    *    rows. In this function pad_top is only assigned to the first chunk
+    *    and pad_bottom only to the chunk that reaches the last input row.
+    * 4. Pop the final chunk again if its top_slice is not below input_height
+    *    or its bottom_slice reaches input_height + pad_bottom.
+    * 5. Record the overlap between neighbouring chunks, then derive each
+    *    chunk's convolution count, dimensions, atomic_count and its
+    *    input/output offsets.
+    */
+   unsigned entries_per_slice = calc_entries_per_slice(operation);
+   unsigned input_banks_required = calc_input_banks(operation);
+   unsigned weights_banks_required = calc_weights_banks(operation);
+   unsigned available_weights_banks = weights_banks_required;
+   unsigned available_input_banks = CBUF_BANKS - weights_banks_required;
+   unsigned pad_top;
+   unsigned pad_bottom;
+   unsigned pad_left;
+   unsigned pad_right;
+
+   calc_explicit_padding(operation, &pad_top, &pad_bottom, &pad_left,
+                         &pad_right);
+
+   if (weights_banks_required + 1 < CBUF_BANKS) {
+      /* Full weights, partial input: the weights leave at least two banks
+       * free, and the input gets every bank the weights do not need. */
+      operation->reuse_weights_cbuf = true;
+   } else {
+      /* Partial weights, partial input: fall back to a fixed split of 7
+       * banks for the input and the remaining banks for the weights. */
+      operation->reuse_weights_cbuf = false;
+      available_input_banks = 7;
+      available_weights_banks = CBUF_BANKS - available_input_banks;
+   }
+
+   if (input_banks_required <= available_input_banks) {
+      /* Full weights, full input: everything fits at once, so one task
+       * covering the full input height with all four paddings is enough. */
+
+      struct split_task task = {0};
+
+      task.num = 0;
+      fill_task(subgraph, operation, &task);
+      task.input_banks = input_banks_required;
+      task.weights_banks = CBUF_BANKS - task.input_banks;
+      task.input_height = operation->input_height;
+
+      task.pad_top = pad_top;
+      task.pad_bottom = pad_bottom;
+      task.pad_left = pad_left;
+      task.pad_right = pad_right;
+
+      task.atomic_count = task.output_width * task.output_height;
+
+      util_dynarray_append(&operation->tasks, struct split_task, task);
+
+      return;
+   }
+
+   struct split_task task = {0};
+   unsigned available_slices =
+      (CBUF_ENTRIES_PER_BANK * available_input_banks) / entries_per_slice;
+
+   task.num = 0;
+   fill_task(subgraph, operation, &task);
+   task.input_banks = available_input_banks;
+   task.weights_banks = available_weights_banks;
+
+   task.top_slice = 0;
+   task.bottom_slice = available_slices - 1;
+
+   task.pad_top = pad_top;
+   task.pad_left = pad_left;
+   task.pad_right = pad_right;
+
+   util_dynarray_append(&operation->tasks, struct split_task, task);
+
+   for (unsigned slice = operation->weights_height - pad_top - 1;
+        slice < operation->input_height;) {
+      memset(&task, 0, sizeof(task));
+
+      struct split_task *prev_task = util_dynarray_element(
+         &operation->tasks, struct split_task,
+         util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
+
+      while (slice <= prev_task->bottom_slice) {
+         slice += operation->stride;
+      }
+      if (slice > prev_task->bottom_slice) {
+         slice -= operation->stride;
+      }
+
+      task.num = util_dynarray_num_elements(&operation->tasks, struct split_task);
+      fill_task(subgraph, operation, &task);
+      task.top_slice = MIN2(slice, prev_task->bottom_slice) -
+                       (operation->weights_height - 1) + operation->stride;
+      task.bottom_slice = task.top_slice + available_slices - 1;
+      task.pad_left = pad_left;
+      task.pad_right = pad_right;
+
+      /* Check if the current task is the last one: once it reaches the final
+       * input row, clamp it there, give it the bottom padding and stop. */
+      if (task.bottom_slice >= operation->input_height - 1) {
+         task.bottom_slice = operation->input_height - 1;
+         task.pad_bottom = pad_bottom;
+         util_dynarray_append(&operation->tasks, struct split_task, task);
+         break;
+      }
+
+      slice = task.top_slice + operation->weights_height - 1;
+      util_dynarray_append(&operation->tasks, struct split_task, task);
+   }
+
+   struct split_task *last_task = util_dynarray_element(
+      &operation->tasks, struct split_task,
+      util_dynarray_num_elements(&operation->tasks, struct split_task) - 1);
+   if (last_task->top_slice >= operation->input_height ||
+       last_task->bottom_slice >= (operation->input_height + pad_bottom)) {
+      (void)util_dynarray_pop_ptr(&operation->tasks, struct split_task);
+   }
+
+   /* Determine the overlap slices between 2 split chunks: rows shared by a
+    * chunk and its predecessor are counted as num_overlap_slices on the
+    * later chunk and as num_retain_slices on the earlier one. */
+   for (int i = 1;
+        i < util_dynarray_num_elements(&operation->tasks, struct split_task);
+        i++) {
+      struct split_task *prev_task =
+         util_dynarray_element(&operation->tasks, struct split_task, i - 1);
+      struct split_task *cur_task =
+         util_dynarray_element(&operation->tasks, struct split_task, i);
+
+      if (prev_task->bottom_slice >= cur_task->top_slice) {
+         cur_task->num_overlap_slices =
+            prev_task->bottom_slice - cur_task->top_slice + 1;
+         prev_task->num_retain_slices = cur_task->num_overlap_slices;
+      } else {
+         cur_task->num_overlap_slices = 0;
+         prev_task->num_retain_slices = 0;
+      }
+   }
+
+   unsigned output_height_processed = 0; /* output rows of earlier chunks */
+   for (int i = 0;
+        i < util_dynarray_num_elements(&operation->tasks, struct split_task);
+        i++) {
+      struct split_task *cur_task =
+         util_dynarray_element(&operation->tasks, struct split_task, i);
+
+      unsigned slice = cur_task->top_slice + (operation->weights_height - 1) -
+                       cur_task->pad_top;
+
+      while (slice <= cur_task->bottom_slice + cur_task->pad_bottom) {
+         slice += operation->stride;
+         cur_task->convolutions++;
+      }
+
+      cur_task->bottom_slice =
+         MIN2(cur_task->bottom_slice, operation->input_height - 1);
+
+      cur_task->input_height = cur_task->bottom_slice - cur_task->top_slice + 1;
+
+      cur_task->output_width = (cur_task->input_width + cur_task->pad_left +
+                                cur_task->pad_right - operation->weights_width) /
+                                  operation->stride +
+                               1;
+      cur_task->output_height =
+         (cur_task->input_height + cur_task->pad_top + cur_task->pad_bottom -
+          operation->weights_height) /
+            operation->stride +
+         1;
+      cur_task->atomic_count = cur_task->output_width * cur_task->output_height;
+
+      cur_task->input_offset =
+         calc_line_stride(operation->input_width) * cur_task->top_slice;
+      cur_task->output_offset =
+         calc_line_stride(operation->output_width) * output_height_processed;
+
+      cur_task->input_banks = available_input_banks;
+      cur_task->weights_banks = available_weights_banks;
+
+      output_height_processed += cur_task->output_height;
+   }
+}
--- a/src/gallium/drivers/rocket/rkt_task.h
+++ b/src/gallium/drivers/rocket/rkt_task.h
@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef RKT_TASK_H
+#define RKT_TASK_H
+
+#include "rkt_ml.h"
+/*
+ * Fills operation->tasks with the split_task entries for `operation`: a
+ * single task when the whole input fits in the buffer banks left for it,
+ * otherwise several tasks that each cover a range of input rows.
+ * `subgraph` is only forwarded to the per-task setup.
+ */
+void rkt_split_tasks(struct rkt_ml_subgraph *subgraph,
+                     struct rkt_operation *operation);
+
+#endif /* RKT_TASK_H */
--- a/src/gallium/drivers/rocket/rules-ng.xsd
+++ b/src/gallium/drivers/rocket/rules-ng.xsd
@ -0,0 +1,457 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<schema xmlns="http://www.w3.org/2001/XMLSchema"
+	targetNamespace="http://nouveau.freedesktop.org/"
+	xmlns:rng="http://nouveau.freedesktop.org/"
+	elementFormDefault="qualified">
+
+	<annotation>
+		<documentation>
+			An updated version of the old rules.xml file from the
+			RivaTV project. Specifications by Pekka Paalanen,
+			preliminary attempt by KoalaBR,
+			first working version by Jakob Bornecrantz.
+			For specifications, see the file rules-ng-format.txt
+			in Nouveau CVS module 'rules-ng'.
+		</documentation>
+		<documentation>Version 0.1</documentation>
+	</annotation>
+
+
+	<!-- Elements -->
+
+	<element name="database"       type="rng:databaseType" />
+	<element name="import"         type="rng:importType" />
+	<element name="copyright"      type="rng:copyrightType" />
+	<element name="domain"         type="rng:domainType" />
+	<element name="group"          type="rng:groupType" />
+	<element name="use-group"      type="rng:refType" />
+	<element name="array"          type="rng:arrayType" />
+	<element name="stripe"         type="rng:stripeType" />
+	<element name="reg64"          type="rng:registerType" />
+	<element name="reg32"          type="rng:registerType" />
+	<element name="reg16"          type="rng:registerType" />
+	<element name="reg8"           type="rng:registerType" />
+	<element name="bitset"         type="rng:bitsetType" />
+	<element name="bitfield"       type="rng:bitfieldType" />
+	<element name="enum"           type="rng:enumType" />
+	<element name="value"          type="rng:valueType" />
+
+	<!-- Copyright elements -->
+	<element name="author"         type="rng:authorType" />
+	<element name="nick"           type="rng:nickType" />
+	<element name="license"        type="rng:docType" />
+
+	<!-- Documentation elements -->
+	
+	<!-- FIXME: at most one brief element should be allowed per parent element -->
+	<element name="brief" type="rng:briefType" />
+	
+	<element name="doc"  type="rng:docType" />
+	<element name="b"    type="rng:textformatType" />
+	<element name="i"    type="rng:textformatType" />
+	<element name="u"    type="rng:textformatType" />
+	<element name="code" type="rng:textcodeType" />
+	<element name="ul"   type="rng:listType" />
+	<element name="ol"   type="rng:listType" />
+	<element name="li"   type="rng:listitemType" />
+
+	<!-- Copyright element types -->
+
+	<complexType name="authorType" mixed="true">
+		<annotation>
+			<documentation>
+				register database author
+			</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<element ref="rng:nick" />
+		</choice>
+		<attribute name="name" type="string" use="required" />
+		<attribute name="email" type="string" use="required" />
+	</complexType>
+
+	<complexType name="nickType">
+		<annotation>
+			<documentation>nickType</documentation>
+		</annotation>
+		<attribute name="name" type="string" use="required" />
+	</complexType>
+
+	<!-- Database element types -->
+
+	<complexType name="databaseType">
+		<annotation>
+			<documentation>databaseType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+		</choice>
+	</complexType>
+
+	<complexType name="importType">
+		<annotation>
+			<documentation>importType</documentation>
+		</annotation>
+		<attribute name="file" type="string" use="required" />
+	</complexType>
+
+	<complexType name="copyrightType">
+		<annotation>
+			<documentation>copyrightType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+			<element ref="rng:author" />
+			<element ref="rng:license" />
+		</choice>
+		<attribute name="year" type="nonNegativeInteger" use="optional" />
+	</complexType>
+
+	<complexType name="domainType">
+		<annotation>
+			<documentation>domainType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+			<group ref="rng:regarrayGroup" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="required" />
+		<attribute name="bare" type="rng:Boolean" use="optional" />
+		<attribute name="prefix" type="NMTOKENS" use="optional" />
+		<attribute name="width" type="rng:DomainWidth" use="optional" />
+		<attribute name="size" type="rng:HexOrNumber" use="optional" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+		<attribute name="variants" type="string" use="optional" />
+	</complexType>
+
+	<complexType name="groupType">
+		<annotation>
+			<documentation>groupType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+			<group ref="rng:regarrayGroup" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="required" />
+	</complexType>
+
+	<complexType name="arrayType">
+		<annotation>
+			<documentation>arrayType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+			<group ref="rng:regarrayGroup" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="optional" />
+		<attribute name="offset" type="rng:HexOrNumber" use="optional" />
+		<attribute name="offsets" type="string" use="optional"/>
+		<attribute name="doffsets" type="string" use="optional"/>
+		<attribute name="index" type="NMTOKENS" use="optional"/>
+		<attribute name="stride" type="rng:HexOrNumber" use="required" />
+		<attribute name="length" type="rng:HexOrNumber" use="required" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+		<attribute name="variants" type="string" use="optional" />
+		<attribute name="usage" type="string" use="optional" />
+	</complexType>
+
+	<complexType name="stripeType">
+		<annotation>
+			<documentation>stripeType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+			<group ref="rng:regarrayGroup" minOccurs="0" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="optional" />
+		<attribute name="offset" type="rng:HexOrNumber" use="optional" />
+		<attribute name="stride" type="rng:HexOrNumber" use="optional" />
+		<attribute name="length" type="rng:HexOrNumber" use="optional" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+		<attribute name="variants" type="string" use="optional" />
+		<attribute name="prefix" type="NMTOKENS" use="optional" />
+	</complexType>
+
+	<complexType name="registerType">
+		<annotation>
+			<documentation>
+				registerType used by reg8, reg16, reg32, reg64
+			</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+			<element ref="rng:value" />
+			<element ref="rng:bitfield" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="required" />
+		<attribute name="offset" type="rng:HexOrNumber" use="required" />
+		<attribute name="access" type="rng:Access" default="rw" use="optional" />
+		<attribute name="type" type="NMTOKENS" use="optional" />
+		<attribute name="shr" type="nonNegativeInteger" use="optional" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+		<attribute name="variants" type="string" use="optional" />
+		<attribute name="stride" type="rng:HexOrNumber" use="optional" />
+		<attribute name="length" type="rng:HexOrNumber" use="optional" />
+		<attribute name="high" type="nonNegativeInteger" use="optional" />
+		<attribute name="low" type="nonNegativeInteger" use="optional" />
+		<attribute name="pos" type="nonNegativeInteger" use="optional" />
+		<attribute name="align" type="nonNegativeInteger" use="optional" />
+		<attribute name="radix" type="nonNegativeInteger" use="optional" />
+		<attribute name="usage" type="string" use="optional" />
+	</complexType>
+
+	<complexType name="bitsetType">
+		<annotation>
+			<documentation>bitsetType</documentation>
+		</annotation>
+		<choice maxOccurs="unbounded">
+			<element ref="rng:bitfield" />
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="required" />
+		<attribute name="inline" type="rng:Boolean" use="optional" />
+		<attribute name="bare" type="rng:Boolean" use="optional" />
+		<attribute name="prefix" type="NMTOKENS" use="optional" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+	</complexType>
+
+	<complexType name="bitfieldType">
+		<annotation>
+			<documentation>bitfieldType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<element ref="rng:value" maxOccurs="unbounded" />
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="required" />
+		<attribute name="high" type="nonNegativeInteger" use="optional" />
+		<attribute name="low" type="nonNegativeInteger" use="optional" />
+		<attribute name="pos" type="nonNegativeInteger" use="optional" />
+		<attribute name="radix" type="nonNegativeInteger" use="optional" />
+		<attribute name="align" type="nonNegativeInteger" use="optional" />
+		<attribute name="type" type="NMTOKENS" use="optional" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+		<attribute name="variants" type="string" use="optional" />
+		<attribute name="addvariant" type="rng:Boolean" use="optional" />
+		<attribute name="shr" type="nonNegativeInteger" use="optional" />
+	</complexType>
+
+	<complexType name="enumType">
+		<annotation>
+			<documentation>enumType</documentation>
+		</annotation>
+		<choice maxOccurs="unbounded">
+			<element ref="rng:value" />
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="required" />
+		<attribute name="inline" type="rng:Boolean" use="optional" />
+		<attribute name="bare" type="rng:Boolean" use="optional" />
+		<attribute name="prefix" type="NMTOKENS" use="optional" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+	</complexType>
+
+	<complexType name="valueType">
+		<annotation>
+			<documentation>valueType</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:docGroup" />
+			<group ref="rng:topGroup" />
+		</choice>
+		<attribute name="name" type="NMTOKEN" use="required" />
+		<attribute name="value" type="string" use="optional" />
+		<attribute name="varset" type="NMTOKEN" use="optional" />
+		<attribute name="variants" type="string" use="optional" />
+	</complexType>
+
+	<complexType name="refType">
+		<annotation>
+			<documentation>refType</documentation>
+		</annotation>
+		<attribute name="ref" type="NMTOKEN" use="required" />
+	</complexType>
+
+
+	<!-- Documentation element types -->
+
+	<complexType name="briefType">
+		<annotation>
+			<documentation>
+				brief documentation, no markup
+			</documentation>
+		</annotation>
+		<simpleContent>
+			<extension base="string" />
+		</simpleContent>
+	</complexType>
+	
+	<complexType name="docType" mixed="true">
+		<annotation>
+			<documentation>
+				root element of documentation sub-tree
+			</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:textformatGroup" />
+			<group ref="rng:listGroup" />
+			<element ref="rng:code" />
+		</choice>
+	</complexType>
+	
+	<complexType name="textformatType" mixed="true">
+		<annotation>
+			<documentation>
+				for bold, underline, italics
+			</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:textformatGroup" />
+		</choice>
+	</complexType>
+	
+	<complexType name="textcodeType">
+		<simpleContent>
+			<extension base="string">
+				<attribute name="title" type="string" />
+			</extension>
+		</simpleContent>
+	</complexType>
+	
+	<complexType name="listType">
+		<annotation>
+			<documentation>
+				definition of a list, ordered or unordered
+			</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<element ref="rng:li" />
+		</choice>
+	</complexType>
+	
+	<complexType name="listitemType" mixed="true">
+		<annotation>
+			<documentation>
+				items of a list
+			</documentation>
+		</annotation>
+		<choice minOccurs="0" maxOccurs="unbounded">
+			<group ref="rng:textformatGroup" />
+			<group ref="rng:listGroup" />
+			<element ref="rng:code" />
+		</choice>
+	</complexType>
+
+
+
+	<!-- Attribute value types -->
+
+	<simpleType name="Hexadecimal">
+		<restriction base="string">
+			<pattern value="0x[0-9a-f]+" />
+			<pattern value="0x[0-9A-F]+" />
+			<pattern value="[0-9]" />
+		</restriction>
+	</simpleType>
+
+	<simpleType name="HexOrNumber">
+		<annotation>
+			<documentation>HexOrNumber</documentation>
+		</annotation>
+		<union memberTypes="rng:Hexadecimal nonNegativeInteger" />
+	</simpleType>
+
+	<simpleType name="Boolean">
+		<restriction base="string">
+			<enumeration value="true" />
+			<enumeration value="1" />
+			<enumeration value="yes" />
+			<enumeration value="false" />
+			<enumeration value="0" />
+			<enumeration value="no" />
+		</restriction>
+	</simpleType>
+
+	<simpleType name="Access">
+		<annotation>
+			<documentation>Access</documentation>
+		</annotation>
+		<restriction base="string">
+			<enumeration value="r" />
+			<enumeration value="w" />
+			<enumeration value="rw" />
+		</restriction>
+	</simpleType>
+
+	<simpleType name="DomainWidth">
+		<annotation>
+			<documentation>DomainWidth</documentation>
+		</annotation>
+		<restriction base="string">
+			<enumeration value="8" />
+			<enumeration value="16" />
+			<enumeration value="32" />
+			<enumeration value="64" />
+		</restriction>
+	</simpleType>
+
+
+
+	<!-- Element groups -->
+
+	<group name="topGroup">
+		<choice>
+			<element ref="rng:copyright" />
+			<element ref="rng:domain" />
+			<element ref="rng:enum" />
+			<element ref="rng:group" />
+			<element ref="rng:bitset" />
+			<element ref="rng:import" />
+		</choice>
+	</group>
+	
+	<group name="regarrayGroup">
+		<choice>
+			<element ref="rng:reg64" />
+			<element ref="rng:reg32" />
+			<element ref="rng:reg16" />
+			<element ref="rng:reg8" />
+			<element ref="rng:array" />
+			<element ref="rng:stripe" />
+			<element ref="rng:use-group" />
+		</choice>
+	</group>
+	
+	<group name="docGroup">
+		<choice>
+			<element ref="rng:brief" />
+			<element ref="rng:doc" />
+		</choice>
+	</group>
+	
+	<group name="textformatGroup">
+		<choice>
+			<element ref="rng:b" />
+			<element ref="rng:i" />
+			<element ref="rng:u" />
+		</choice>
+	</group>
+	
+	<group name="listGroup">
+		<choice>
+			<element ref="rng:ul" />
+			<element ref="rng:ol" />
+		</choice>
+	</group>
+
+</schema>
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@ -185,6 +185,12 @@ if with_gallium_lima
 else
  driver_lima = declare_dependency()
 endif
+if with_gallium_rocket
+  subdir('winsys/rocket/drm')
+  subdir('drivers/rocket')
+else
+  driver_rocket = declare_dependency()
+endif
 if with_gallium_zink
  subdir('drivers/zink')
 else
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@ -62,7 +62,7 @@ libgallium_dri = shared_library(
    driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
    driver_tegra, driver_i915, driver_svga, driver_virgl,
    driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
-    driver_asahi, driver_crocus
+    driver_asahi, driver_crocus, driver_rocket
  ],
  install : true,
  name_suffix : libname_suffix,
--- a/src/gallium/targets/dril/meson.build
+++ b/src/gallium/targets/dril/meson.build
@ -124,7 +124,8 @@ foreach d : [[with_gallium_kmsro, [
             [with_gallium_lima, 'lima_dri.so'],
             [with_gallium_d3d12, 'd3d12_dri.so'],
             [with_gallium_zink, 'zink_dri.so'],
-             [with_gallium_asahi, 'asahi_dri.so']]
+             [with_gallium_asahi, 'asahi_dri.so'],
+             [with_gallium_rocket, 'rocket_dri.so']]
  if d[0]
    dril_drivers += d[1]
  endif
--- a/src/gallium/winsys/rocket/drm/meson.build
+++ b/src/gallium/winsys/rocket/drm/meson.build
@ -0,0 +1,13 @@
+# Copyright 2017 Broadcom
+# SPDX-License-Identifier: MIT
+
+librocketwinsys = static_library(
+  'rocketwinsys',
+  files('rkt_drm_winsys.c'),
+  include_directories : [
+    inc_src, inc_include,
+    inc_gallium, inc_gallium_aux, inc_gallium_drivers,
+  ],
+  gnu_symbol_visibility : 'hidden',
+  dependencies: [idep_mesautil],
+)
--- a/src/gallium/winsys/rocket/drm/rkt_drm_public.h
+++ b/src/gallium/winsys/rocket/drm/rkt_drm_public.h
@ -0,0 +1,17 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef __RKT_DRM_PUBLIC_H__
+#define __RKT_DRM_PUBLIC_H__
+
+struct pipe_screen;
+struct pipe_screen_config;
+/*
+ * Obtains a pipe_screen for the file descriptor `drmFD` through
+ * u_pipe_screen_lookup_or_create(), with rkt_screen_create as the creation
+ * callback. The screen is given a duplicate made with os_dupfd_cloexec(),
+ * not `drmFD` itself.
+ */
+struct pipe_screen *
+rkt_drm_screen_create(int drmFD, const struct pipe_screen_config *config);
+
+#endif /* __RKT_DRM_PUBLIC_H__ */
--- a/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c
+++ b/src/gallium/winsys/rocket/drm/rkt_drm_winsys.c
@ -0,0 +1,19 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/os_file.h"
+#include "util/u_screen.h"
+
+#include "rocket/rkt_device.h"
+#include "rkt_drm_public.h"
+
+/*
+ * Winsys entry point: obtains the pipe_screen for `fd` through
+ * u_pipe_screen_lookup_or_create(), with rkt_screen_create as the creation
+ * callback and no renderonly argument.
+ */
+struct pipe_screen *
+rkt_drm_screen_create(int fd, const struct pipe_screen_config *config)
+{
+   /* Pass a duplicate obtained from os_dupfd_cloexec(), not `fd` itself. */
+   int screen_fd = os_dupfd_cloexec(fd);
+   struct pipe_screen *screen = u_pipe_screen_lookup_or_create(
+      screen_fd, config, NULL, rkt_screen_create);
+
+   return screen;
+}