WIP: thames: initial commit

Tomeu Vizoso 2025-10-28 12:39:12 +01:00
parent 562bb8b62b
commit 931683bab6
34 changed files with 1967 additions and 4 deletions

View file

@@ -5,6 +5,7 @@ src/gallium/drivers/ethosu/**/*
src/gallium/drivers/i915
src/gallium/drivers/r300/compiler/*
src/gallium/drivers/rocket/**/*
src/gallium/drivers/thames/**/*
src/gallium/targets/teflon/**/*
src/gallium/frontends/teflon/**/*
src/amd/vulkan/**/*

View file

@@ -0,0 +1,198 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (C) 2025 Texas Instruments Incorporated - https://www.ti.com/ */
#ifndef _THAMES_DRM_H_
#define _THAMES_DRM_H_
#include "drm.h"
#if defined(__cplusplus)
extern "C" {
#endif
/**
* DOC: IOCTL IDs
*
* enum drm_thames_ioctl_id - IOCTL IDs
*
* Place new ioctls at the end, don't re-order, don't replace or remove entries.
*
* These IDs are not meant to be used directly. Use the DRM_IOCTL_THAMES_xxx
* definitions instead.
*/
enum drm_thames_ioctl_id {
/** @DRM_THAMES_BO_CREATE: Create a buffer object. */
DRM_THAMES_BO_CREATE,
/** @DRM_THAMES_BO_WAIT: Wait on a buffer object's fence. */
DRM_THAMES_BO_WAIT,
/**
* @DRM_THAMES_BO_MMAP_OFFSET: Get the file offset to pass to
* mmap to map a GEM object.
*/
DRM_THAMES_BO_MMAP_OFFSET,
/**
* @DRM_THAMES_CMDSTREAM_BO_CREATE: Create a command stream buffer
* object.
*/
DRM_THAMES_CMDSTREAM_BO_CREATE,
/** @DRM_THAMES_SUBMIT: Submit a job and BOs to run. */
DRM_THAMES_SUBMIT,
};
/**
* DOC: IOCTL arguments
*/
/**
* enum drm_thames_bo_flags - Buffer object flags, passed at creation time.
*/
enum drm_thames_bo_flags {
/**
* @DRM_THAMES_BO_NO_MMAP: The buffer object will never be CPU-mapped
* in userspace.
*/
DRM_THAMES_BO_NO_MMAP = (1 << 0),
};
/**
* struct drm_thames_bo_create - Arguments passed to DRM_IOCTL_THAMES_BO_CREATE.
*/
struct drm_thames_bo_create {
/**
* @size: Requested size for the object
*
* The (page-aligned) allocated size for the object will be returned.
*/
__u64 size;
/**
* @flags: Flags. Must be a combination of drm_thames_bo_flags flags.
*/
__u32 flags;
/**
* @handle: Returned handle for the object.
*
* Object handles are nonzero.
*/
__u32 handle;
};
/**
* struct drm_thames_bo_mmap_offset - Arguments passed to DRM_IOCTL_THAMES_BO_MMAP_OFFSET.
*/
struct drm_thames_bo_mmap_offset {
/** @handle: Handle of the object we want an mmap offset for. */
__u32 handle;
/** @pad: MBZ. */
__u32 pad;
/** @offset: The fake offset to use for subsequent mmap calls. */
__u64 offset;
};
/**
* struct drm_thames_bo_wait - ioctl argument for waiting for
* completion of the last DRM_THAMES_SUBMIT on a BO.
*
* This is useful for cases where multiple processes might be
* rendering to a BO and you want to wait for all rendering to be
* completed.
*/
struct drm_thames_bo_wait {
/** @handle: Handle of the BO being waited on. */
__u32 handle;
/** @pad: MBZ. */
__u32 pad;
/** @timeout_ns: Absolute timeout, in nanoseconds. */
__s64 timeout_ns;
};
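/*
 * Illustrative note (the kernel side is not part of this commit): assuming
 * CLOCK_MONOTONIC-based timeouts as in other DRM drivers, waiting at most
 * one second would look like:
 *   clock_gettime(CLOCK_MONOTONIC, &ts);
 *   bo_wait.timeout_ns = (__s64)ts.tv_sec * 1000000000 + ts.tv_nsec + 1000000000;
 * The Gallium driver in this commit simply passes INT64_MAX to wait forever.
 */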
/**
 * struct drm_thames_cmdstream_bo_create - Arguments passed to
 * DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE.
 */
struct drm_thames_cmdstream_bo_create {
/** @size: Size of the data argument. */
__u32 size;
/** @flags: Flags, currently must be 0. */
__u32 flags;
/** @data: Pointer to the data. */
__u64 data;
/** @handle: Returned GEM handle for the BO. */
__u32 handle;
/** @pad: Padding, must be 0. */
__u32 pad;
};
/**
* struct drm_thames_job - A job to be run on the NPU
*
* The kernel will schedule the execution of this job taking into account its
* dependencies with other jobs. All tasks in the same job will be executed
* sequentially on the same core, to benefit from memory residency in SRAM.
*/
struct drm_thames_job {
/** Input: BO handle for kernel. */
__u32 kernel;
/** Input: Size in bytes of the compiled kernel. */
__u32 kernel_size;
#define THAMES_MAX_REGIONS 8
/** Input: Array of BO handles for each region. */
__u32 region_bo_handles[THAMES_MAX_REGIONS];
};
/**
* struct drm_thames_submit - ioctl argument for submitting commands to the NPU.
*
* The kernel will schedule the execution of these jobs in dependency order.
*/
struct drm_thames_submit {
/** Input: Pointer to an array of struct drm_thames_job. */
__u64 jobs;
/** Input: Number of jobs passed in. */
__u32 job_count;
/** Reserved, must be zero. */
__u32 pad;
};
/**
* DRM_IOCTL_THAMES() - Build a thames IOCTL number
* @__access: Access type. Must be R, W or RW.
* @__id: One of the DRM_THAMES_xxx IDs.
* @__type: Suffix of the type being passed to the IOCTL.
*
* Don't use this macro directly, use the DRM_IOCTL_THAMES_xxx
* values instead.
*
* Return: An IOCTL number to be passed to ioctl() from userspace.
*/
#define DRM_IOCTL_THAMES(__access, __id, __type) \
DRM_IO ## __access(DRM_COMMAND_BASE + DRM_THAMES_ ## __id, \
struct drm_thames_ ## __type)
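/*
 * For example, DRM_IOCTL_THAMES(WR, BO_CREATE, bo_create) expands, via token
 * pasting, to DRM_IOWR(DRM_COMMAND_BASE + DRM_THAMES_BO_CREATE,
 * struct drm_thames_bo_create).
 */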
enum {
DRM_IOCTL_THAMES_BO_CREATE =
DRM_IOCTL_THAMES(WR, BO_CREATE, bo_create),
DRM_IOCTL_THAMES_BO_WAIT =
DRM_IOCTL_THAMES(WR, BO_WAIT, bo_wait),
DRM_IOCTL_THAMES_BO_MMAP_OFFSET =
DRM_IOCTL_THAMES(WR, BO_MMAP_OFFSET, bo_mmap_offset),
DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE =
DRM_IOCTL_THAMES(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create),
DRM_IOCTL_THAMES_SUBMIT =
DRM_IOCTL_THAMES(WR, SUBMIT, submit),
};
#if defined(__cplusplus)
}
#endif
#endif /* _THAMES_DRM_H_ */
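The ioctls above compose into a create → map → submit → wait flow. The sketch below is illustrative commentary, not part of this commit: the header name thames_accel.h follows the driver's drm-uapi/thames_accel.h include, the region indices follow the driver's COEFS_REGION (0) and IO_REGION (1) defines, and the already-opened DRM fd and error handling are assumptions.

/* Hypothetical userspace sketch; names and flow are assumptions as noted above. */
#include <stdint.h>
#include <sys/mman.h>
#include <xf86drm.h>
#include "thames_accel.h"

static void *
example_create_and_map_bo(int fd, __u64 size, __u32 *handle)
{
   /* Create a BO, then ask for the fake offset used to mmap it via the fd. */
   struct drm_thames_bo_create create = { .size = size };
   if (drmIoctl(fd, DRM_IOCTL_THAMES_BO_CREATE, &create))
      return NULL;
   *handle = create.handle;

   struct drm_thames_bo_mmap_offset off = { .handle = create.handle };
   if (drmIoctl(fd, DRM_IOCTL_THAMES_BO_MMAP_OFFSET, &off))
      return NULL;

   void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, off.offset);
   return map == MAP_FAILED ? NULL : map;
}

static int
example_submit_and_wait(int fd, __u32 kernel_bo, __u32 kernel_size,
                        __u32 coefs_bo, __u32 io_bo)
{
   /* One job; region slots 0/1 mirror the driver's COEFS_REGION/IO_REGION. */
   struct drm_thames_job job = {
      .kernel = kernel_bo,
      .kernel_size = kernel_size,
      .region_bo_handles = { coefs_bo, io_bo },
   };
   struct drm_thames_submit submit = {
      .jobs = (__u64)(uintptr_t)&job,
      .job_count = 1,
   };
   if (drmIoctl(fd, DRM_IOCTL_THAMES_SUBMIT, &submit))
      return -1;

   /* Wait for the last submit touching the IO BO, as the driver itself does. */
   struct drm_thames_bo_wait wait = {
      .handle = io_bo,
      .timeout_ns = INT64_MAX,
   };
   return drmIoctl(fd, DRM_IOCTL_THAMES_BO_WAIT, &wait);
}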

View file

@@ -186,7 +186,7 @@ elif gallium_drivers.contains('all')
gallium_drivers = [
'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915',
'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris',
'zink', 'd3d12', 'asahi', 'rocket', 'ethosu'
'zink', 'd3d12', 'asahi', 'rocket', 'ethosu', 'thames'
]
endif
@@ -215,6 +215,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12')
with_gallium_asahi = gallium_drivers.contains('asahi')
with_gallium_rocket = gallium_drivers.contains('rocket')
with_gallium_ethosu = gallium_drivers.contains('ethosu')
with_gallium_thames = gallium_drivers.contains('thames')
foreach gallium_driver : gallium_drivers
pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
endforeach

View file

@@ -88,7 +88,7 @@ option(
'all', 'auto',
'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris',
'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
'rocket', 'softpipe', 'svga', 'tegra', 'thames', 'v3d', 'vc4', 'virgl', 'zink',
],
description : 'List of gallium drivers to build. If this is set to auto ' +
'all drivers applicable to the target OS/architecture ' +

View file

@@ -45,6 +45,9 @@ endif
if with_gallium_ethosu
renderonly_drivers_c_args += '-DGALLIUM_ETHOSU'
endif
if with_gallium_thames
renderonly_drivers_c_args += '-DGALLIUM_THAMES'
endif
libpipe_loader_static = static_library(
'pipe_loader_static',

View file

@@ -87,6 +87,7 @@ static const struct drm_driver_descriptor *driver_descriptors[] = {
&rocket_driver_descriptor,
&ethosu_driver_descriptor,
&tegra_driver_descriptor,
&thames_driver_descriptor,
&lima_driver_descriptor,
&zink_driver_descriptor,
};
@@ -383,6 +384,9 @@ pipe_loader_get_compatible_render_capable_device_fds(int kms_only_fd, unsigned i
#if defined GALLIUM_ETHOSU
"ethosu",
#endif
#if defined GALLIUM_THAMES
"thames",
#endif
#if defined GALLIUM_V3D
"v3d",
#endif

View file

@@ -53,6 +53,7 @@ const struct drm_driver_descriptor descriptor_name = { \
#undef GALLIUM_ASAHI
#undef GALLIUM_ROCKET
#undef GALLIUM_ETHOSU
#undef GALLIUM_THAMES
#endif
#ifdef GALLIUM_I915
@@ -480,6 +481,24 @@ DRM_DRIVER_DESCRIPTOR(ethosu, NULL, 0)
DRM_DRIVER_DESCRIPTOR_STUB(ethosu)
#endif
#ifdef GALLIUM_THAMES
#include "thames/drm/thames_drm_public.h"
static struct pipe_screen *
pipe_thames_create_screen(int fd, const struct pipe_screen_config *config)
{
struct pipe_screen *screen;
screen = thames_drm_screen_create(fd, config);
return screen ? debug_screen_wrap(screen) : NULL;
}
DRM_DRIVER_DESCRIPTOR(thames, NULL, 0)
#else
DRM_DRIVER_DESCRIPTOR_STUB(thames)
#endif
#ifdef GALLIUM_KMSRO
#include "kmsro/drm/kmsro_drm_public.h"

View file

@@ -25,6 +25,7 @@ extern const struct drm_driver_descriptor rknpu_driver_descriptor;
extern const struct drm_driver_descriptor rocket_driver_descriptor;
extern const struct drm_driver_descriptor ethosu_driver_descriptor;
extern const struct drm_driver_descriptor tegra_driver_descriptor;
extern const struct drm_driver_descriptor thames_driver_descriptor;
extern const struct drm_driver_descriptor lima_driver_descriptor;
extern const struct drm_driver_descriptor zink_driver_descriptor;
extern const struct drm_driver_descriptor kmsro_driver_descriptor;

View file

@@ -0,0 +1,2 @@
BasedOnStyle: InheritParentConfig
DisableFormat: false

View file

@@ -0,0 +1,14 @@
Add.Op/.*
AddQuant.Op/.*
Conv2D.Op/.*
DepthwiseConv2D.Op/.*
FullyConnected.Op/.*
# Don't support unfused Pad operations yet
Models.Op/yolox_000
Models.Op/yolox_003
Models.Op/yolox_012
Models.Op/yolox_027
Models.Op/yolox_042
Models.Op/yolox_077
Models.Op/yolox_086

View file

@@ -0,0 +1,32 @@
# Copyright 2019 Google, Inc
# SPDX-License-Identifier: MIT
# thames_registers = custom_target(
# 'thames_registers.h',
# input : ['gen_parser.py', 'gen_header.py', 'registers.xml'],
# output : 'thames_registers.h',
# command : [prog_python, '@INPUT1@', '--rnn', '.', '--xml', '@INPUT2@', 'c-defines'],
# capture : true,
# )
files_thames = files(
'thames_cmd.c',
'thames_coefs.c',
'thames_device.c',
'thames_lower.c',
'thames_ml.c',
'thames_sched.c',
)
libthames = static_library(
'thames',
[files_thames], #, thames_registers],
include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src],
gnu_symbol_visibility : 'hidden',
dependencies : [idep_mesautil, dep_libdrm],
)
driver_thames = declare_dependency(
compile_args : '-DGALLIUM_THAMES',
link_with : [libthameswinsys, libthames]
)

View file

@@ -0,0 +1,21 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include <fcntl.h>
#include <math.h>
#include <stdbool.h>
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "thames_cmd.h"
#include "thames_coefs.h"
#include "thames_ml.h"
#include "thames_sched.h"
void
thames_emit_cmdstream(struct thames_subgraph *subgraph)
{
}

View file

@@ -0,0 +1,13 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_CMD_H
#define THAMES_CMD_H
#include "thames_ml.h"
void thames_emit_cmdstream(struct thames_subgraph *subgraph);
#endif /* THAMES_CMD_H */

View file

@@ -0,0 +1,17 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "util/u_inlines.h"
#include "thames_coefs.h"
void
thames_fill_coefs(struct thames_subgraph *subgraph,
struct thames_operation *operation,
struct pipe_resource *bias_rsrc,
struct pipe_resource *weight_rsrc)
{
}

View file

@@ -0,0 +1,17 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_COEFS_H
#define THAMES_COEFS_H
#include "thames_ml.h"
void
thames_fill_coefs(struct thames_subgraph *subgraph,
struct thames_operation *operation,
struct pipe_resource *bias_rsrc,
struct pipe_resource *weight_rsrc);
#endif /* THAMES_COEFS_H */

View file

@@ -0,0 +1,222 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "thames_device.h"
#include "thames_ml.h"
#include "drm-uapi/thames_accel.h"
#include <xf86drm.h>
#include "util/os_mman.h"
#include "util/u_inlines.h"
#include "util/u_surface.h"
#include "util/u_transfer.h"
static const struct debug_named_value thames_debug_options[] = {
{"dbg_msgs", THAMES_DBG_MSGS, "Print debug messages"},
{"dump_bos", THAMES_DBG_DUMP_BOS, "Dump buffers for analysis"},
{"zero_bos", THAMES_DBG_ZERO, "Zero buffers for debugging"},
DEBUG_NAMED_VALUE_END};
DEBUG_GET_ONCE_FLAGS_OPTION(thames_debug, "THAMES_DEBUG", thames_debug_options, 0)
int thames_debug = 0;
static void
thames_destroy_screen(struct pipe_screen *pscreen)
{
struct thames_screen *screen = thames_screen(pscreen);
ralloc_free(screen);
}
static void
thames_destroy_context(struct pipe_context *pctx)
{
struct thames_context *ctx = thames_context(pctx);
ralloc_free(ctx);
}
struct thames_transfer {
struct pipe_transfer base;
void *map;
};
static void *
thames_buffer_map(struct pipe_context *pctx,
struct pipe_resource *prsc, unsigned level,
unsigned usage, const struct pipe_box *box,
struct pipe_transfer **out_transfer)
{
struct thames_screen *screen = thames_screen(pctx->screen);
struct thames_resource *rsc = thames_resource(prsc);
struct drm_thames_bo_wait bo_wait = {0};
struct drm_thames_bo_mmap_offset bo_mmap_offset = {0};
int ret;
assert(level == 0);
assert(prsc->target == PIPE_BUFFER);
assert(box->y == 0);
assert(box->z == 0);
assert(box->height == 1);
assert(box->depth == 1);
struct thames_transfer *transfer = rzalloc(NULL, struct thames_transfer);
if (!transfer)
return NULL;
transfer->base.level = level;
transfer->base.usage = usage;
transfer->base.box = *box;
pipe_resource_reference(&transfer->base.resource, prsc);
bo_wait.handle = rsc->handle;
bo_wait.timeout_ns = INT64_MAX;
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_WAIT, &bo_wait);
if (ret == -1)
goto free_transfer;
bo_mmap_offset.handle = rsc->handle;
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_MMAP_OFFSET, &bo_mmap_offset);
if (ret == -1)
goto free_transfer;
uint8_t *map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED,
screen->fd, bo_mmap_offset.offset);
if (map == MAP_FAILED)
goto free_transfer;
transfer->map = map;
*out_transfer = &transfer->base;
return map + box->x;
free_transfer:
pipe_resource_reference(&transfer->base.resource, NULL);
ralloc_free(transfer);
return NULL;
}
static void
thames_buffer_unmap(struct pipe_context *pctx,
struct pipe_transfer *transfer)
{
struct thames_transfer *trans = (struct thames_transfer *)transfer;
/* Release the CPU mapping created in thames_buffer_map before dropping the
* resource reference taken there. */
os_munmap(trans->map, transfer->resource->width0);
pipe_resource_reference(&transfer->resource, NULL);
ralloc_free(trans);
}
static struct pipe_context *
thames_create_context(struct pipe_screen *screen,
void *priv, unsigned flags)
{
struct thames_context *ctx = rzalloc(NULL, struct thames_context);
if (!ctx)
return NULL;
struct pipe_context *pctx = &ctx->base;
pctx->screen = screen;
pctx->priv = priv;
pctx->destroy = thames_destroy_context;
pctx->buffer_map = thames_buffer_map;
pctx->buffer_unmap = thames_buffer_unmap;
pctx->resource_copy_region = util_resource_copy_region;
pctx->buffer_subdata = u_default_buffer_subdata;
pctx->clear_buffer = u_default_clear_buffer;
pctx->ml_operation_supported = thames_ml_operation_supported;
pctx->ml_subgraph_create = thames_ml_subgraph_create;
pctx->ml_subgraph_invoke = thames_ml_subgraph_invoke;
pctx->ml_subgraph_read_output = thames_ml_subgraph_read_outputs;
pctx->ml_subgraph_destroy = thames_ml_subgraph_destroy;
return pctx;
}
static struct pipe_resource *
thames_resource_create(struct pipe_screen *pscreen,
const struct pipe_resource *templat)
{
struct thames_screen *screen = thames_screen(pscreen);
struct drm_thames_bo_create arg = {0};
struct thames_resource *rsc;
int ret;
assert(templat->target == PIPE_BUFFER);
assert(templat->height0 == 1);
assert(templat->depth0 == 1);
assert(templat->array_size == 1);
rsc = rzalloc(NULL, struct thames_resource);
if (!rsc)
return NULL;
rsc->base = *templat;
rsc->base.screen = pscreen;
rsc->base.nr_samples = templat->nr_samples;
pipe_reference_init(&rsc->base.reference, 1);
rsc->bo_size = templat->width0;
arg.size = templat->width0;
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_CREATE, &arg);
if (ret < 0)
goto free_rsc;
rsc->handle = arg.handle;
return &rsc->base;
free_rsc:
ralloc_free(rsc);
return NULL;
}
static void
thames_resource_destroy(struct pipe_screen *pscreen,
struct pipe_resource *prsc)
{
struct thames_resource *rsc = thames_resource(prsc);
struct thames_screen *screen = thames_screen(pscreen);
struct drm_gem_close arg = {0};
int ret;
arg.handle = rsc->handle;
ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
assert(ret >= 0);
ralloc_free(rsc);
}
static int
thames_screen_get_fd(struct pipe_screen *pscreen)
{
return thames_screen(pscreen)->fd;
}
struct pipe_screen *
thames_screen_create(int fd,
const struct pipe_screen_config *config,
struct renderonly *ro)
{
struct thames_screen *thames_screen;
struct pipe_screen *screen;
thames_screen = rzalloc(NULL, struct thames_screen);
if (!thames_screen)
return NULL;
screen = &thames_screen->pscreen;
thames_debug = debug_get_option_thames_debug();
thames_screen->fd = fd;
screen->get_screen_fd = thames_screen_get_fd;
screen->destroy = thames_destroy_screen;
screen->context_create = thames_create_context;
screen->resource_create = thames_resource_create;
screen->resource_destroy = thames_resource_destroy;
return screen;
}

View file

@@ -0,0 +1,73 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "renderonly/renderonly.h"
#include "util/log.h"
#include "drm-uapi/thames_accel.h"
#ifndef THAMES_SCREEN_H
#define THAMES_SCREEN_H
enum thames_dbg {
THAMES_DBG_MSGS = BITFIELD_BIT(0),
THAMES_DBG_DUMP_BOS = BITFIELD_BIT(1),
THAMES_DBG_ZERO = BITFIELD_BIT(2),
};
extern int thames_debug;
#define DBG_ENABLED(flag) unlikely(thames_debug & (flag))
#define DBG(fmt, ...) \
do { \
if (DBG_ENABLED(THAMES_DBG_MSGS)) \
mesa_logd("%s:%d: " fmt, __func__, __LINE__, \
##__VA_ARGS__); \
} while (0)
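/*
 * Illustrative usage: with THAMES_DEBUG=dbg_msgs in the environment,
 * DBG("mapped handle %u\n", handle) is routed through mesa_logd(); without
 * it, the flag test fails and the message is skipped at runtime.
 */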
struct thames_screen {
struct pipe_screen pscreen;
int fd;
};
static inline struct thames_screen *
thames_screen(struct pipe_screen *p)
{
return (struct thames_screen *)p;
}
struct thames_context {
struct pipe_context base;
};
static inline struct thames_context *
thames_context(struct pipe_context *pctx)
{
return (struct thames_context *)pctx;
}
struct thames_resource {
struct pipe_resource base;
uint32_t handle;
uint64_t bo_size;
};
static inline struct thames_resource *
thames_resource(struct pipe_resource *p)
{
return (struct thames_resource *)p;
}
struct pipe_screen *thames_screen_create(int fd,
const struct pipe_screen_config *config,
struct renderonly *ro);
#endif /* THAMES_SCREEN_H */

View file

@@ -0,0 +1 @@
0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x0

View file

@@ -0,0 +1,454 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "thames_lower.h"
#include "thames_coefs.h"
#include "thames_sched.h"
static bool
is_depthwise(const struct pipe_ml_operation *poperation)
{
unsigned input_channels = poperation->input_tensors[0]->dims[3];
unsigned output_channels = poperation->output_tensors[0]->dims[3];
return poperation->conv.depthwise && input_channels > 1 &&
output_channels > 1;
}
static unsigned
needed_total_padding(unsigned input_size, unsigned stride, unsigned filter_size)
{
/* Compute signed so the subtraction can't wrap around on unsigned types. */
if (input_size % stride == 0)
return MAX2((int)filter_size - (int)stride, 0);
return MAX2((int)filter_size - (int)(input_size % stride), 0);
}
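/*
 * Worked example for the heuristic below (illustrative numbers): a 3x3
 * kernel (kernel_size = 9) over an IFM depth of 16 gives a depth-first
 * utilization of 16 / 32 = 0.5 and a part-kernel-first utilization of
 * (16 / 16) * (9 / 12) = 0.75, so part-kernel first is chosen.
 */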
static bool
thames_is_part_kernel_first(struct thames_operation *operation)
{
// Determine which block traversal strategy has better DPU utilization
unsigned kernel_size = operation->kernel.height * operation->kernel.width;
unsigned depth = operation->ifm.shape.depth;
float depth_utilization = (float)depth / thames_round_up_to_multiple(depth, 32);
float part_kernel_utilization = ((float)depth / thames_round_up_to_multiple(depth, 8));
part_kernel_utilization *= (float)kernel_size / thames_round_up_to_multiple(kernel_size, 4);
if (operation->type != THAMES_OPERATION_TYPE_CONVOLUTION)
return false;
if (operation->kernel.depthwise)
return false;
// Part-kernel first is always better for ifm depths <= 8
if (part_kernel_utilization >= depth_utilization || depth <= 8)
return true;
return false;
}
static void
set_feature_maps(struct pipe_tensor *input_tensor,
struct pipe_tensor *output_tensor,
struct thames_operation *operation)
{
operation->ifm.tensor_idx = input_tensor->index;
operation->ifm.shape.height = input_tensor->dims[1];
operation->ifm.shape.width = input_tensor->dims[2];
operation->ifm.shape.depth = input_tensor->dims[3];
operation->ifm.zero_point = input_tensor->zero_point;
operation->ifm.scale = input_tensor->scale;
operation->ifm.is_signed = input_tensor->is_signed;
operation->ofm.tensor_idx = output_tensor->index;
operation->ofm.shape.height = output_tensor->dims[1];
operation->ofm.shape.width = output_tensor->dims[2];
operation->ofm.shape.depth = output_tensor->dims[3];
operation->ofm.zero_point = output_tensor->zero_point;
operation->ofm.scale = output_tensor->scale;
operation->ofm.is_signed = output_tensor->is_signed;
}
static const struct pipe_ml_operation *
thames_find_first_consumer(const struct pipe_ml_operation *poperations,
unsigned count,
unsigned tensor_index)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->input_count; j++)
if (poperation->input_tensors[j]->index == tensor_index)
return poperation;
}
return NULL;
}
static void
allocate_feature_maps(struct thames_subgraph *subgraph, struct thames_operation *operation)
{
thames_allocate_feature_map(subgraph, &operation->ifm);
operation->ifm.tiles.height_0 = operation->ifm.shape.height;
operation->ifm.tiles.height_1 = operation->ifm.shape.height;
operation->ifm.tiles.width_0 = operation->ifm.shape.width;
thames_allocate_feature_map(subgraph, &operation->ofm);
operation->ofm.tiles.height_0 = operation->ofm.shape.height;
operation->ofm.tiles.height_1 = operation->ofm.shape.height;
operation->ofm.tiles.width_0 = operation->ofm.shape.width;
}
static const struct pipe_ml_operation *
thames_find_first_producer(const struct pipe_ml_operation *poperations, unsigned count,
unsigned tensor_index)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->output_count; j++) {
if (poperation->output_tensors[j]->index == tensor_index)
return poperation;
}
}
return NULL;
}
static void
thames_lower_convolution(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct pipe_tensor *input_tensor,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_CONVOLUTION;
operation->conv.depthwise = is_depthwise(poperation);
// operation->padding_same = poperation->conv.padding_same;
// operation->stride = poperation->conv.stride_x;
set_feature_maps(input_tensor, poperation->output_tensors[0], operation);
operation->kernel.height = poperation->conv.weight_tensor->dims[1];
operation->kernel.width = poperation->conv.weight_tensor->dims[2];
operation->kernel.stride_y = poperation->conv.stride_y;
operation->kernel.stride_x = poperation->conv.stride_x;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
operation->kernel.depthwise = is_depthwise(poperation);
operation->kernel.scale = poperation->conv.weight_tensor->scale;
operation->kernel.zero_point = poperation->conv.weight_tensor->zero_point;
operation->kernel.is_signed = poperation->conv.weight_tensor->is_signed;
operation->conv.part_kernel_first = thames_is_part_kernel_first(operation);
if (poperation->conv.padding_same) {
unsigned vert = needed_total_padding(input_tensor->dims[1], poperation->conv.stride_y, poperation->conv.weight_tensor->dims[1]);
unsigned horiz = needed_total_padding(input_tensor->dims[2], poperation->conv.stride_x, poperation->conv.weight_tensor->dims[2]);
operation->pad.top = vert / 2;
operation->pad.left = horiz / 2;
operation->pad.bottom = (vert + 1) / 2;
operation->pad.right = (horiz + 1) / 2;
} else {
operation->pad.top = 0;
operation->pad.left = 0;
operation->pad.bottom = 0;
operation->pad.right = 0;
}
allocate_feature_maps(subgraph, operation);
thames_sched_operation(subgraph, operation);
thames_fill_coefs(subgraph, operation, poperation->conv.bias_tensor->resource, poperation->conv.weight_tensor->resource);
}
static void
thames_lower_pooling(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = poperation->pooling.type == PIPE_ML_POOLING_TYPE_AVG;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->kernel.height = poperation->pooling.filter_height;
operation->kernel.width = poperation->pooling.filter_width;
operation->kernel.stride_y = poperation->pooling.stride_y;
operation->kernel.stride_x = poperation->pooling.stride_x;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
if (poperation->pooling.padding_same) {
unsigned vert = needed_total_padding(operation->ifm.shape.height, poperation->pooling.stride_y, poperation->pooling.filter_height);
unsigned horiz = needed_total_padding(operation->ifm.shape.width, poperation->pooling.stride_x, poperation->pooling.filter_width);
operation->pad.top = vert / 2;
operation->pad.left = horiz / 2;
operation->pad.bottom = (vert + 1) / 2;
operation->pad.right = (horiz + 1) / 2;
} else {
operation->pad.top = 0;
operation->pad.left = 0;
operation->pad.bottom = 0;
operation->pad.right = 0;
}
allocate_feature_maps(subgraph, operation);
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_concatenation(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
unsigned input_idx,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = true;
set_feature_maps(poperation->input_tensors[input_idx], poperation->output_tensors[0], operation);
operation->ofm.shape.depth = operation->ifm.shape.depth;
operation->round_mode = THAMES_ROUNDING_NATURAL;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
allocate_feature_maps(subgraph, operation);
for (unsigned i = 0; i < input_idx; i++) {
struct thames_tensor *tensor = thames_find_tensor(subgraph, operation->ofm.tensor_idx);
if (tensor->layout == THAMES_LAYOUT_NHWC)
operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[3];
else if (tensor->layout == THAMES_LAYOUT_NHCWB16)
operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * ALIGN(poperation->input_tensors[i]->dims[3], 16);
else
assert(0 && "Unsupported layout");
}
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_resize(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = true;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->ifm.zero_point = 0;
operation->ofm.zero_point = 0;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
operation->upscale = true;
allocate_feature_maps(subgraph, operation);
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_strided_slice(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = true;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->ifm.shape = operation->ofm.shape;
operation->ifm.zero_point = 0;
operation->ofm.zero_point = 0;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
allocate_feature_maps(subgraph, operation);
unsigned augmented_coord[5];
augmented_coord[0] = 0;
for (int i = 0; i < 4; ++i) {
augmented_coord[i + 1] = poperation->slice.begin[i];
}
unsigned augmented_strides[5];
augmented_strides[0] = operation->ifm.shape.depth * operation->ifm.shape.width * operation->ifm.shape.height;
augmented_strides[1] = 1;
augmented_strides[2] = operation->ifm.shape.depth * operation->ifm.shape.width;
augmented_strides[3] = operation->ifm.shape.depth;
augmented_strides[4] = 1;
unsigned address_offset = 0;
for (int i = 0; i < 5; ++i)
address_offset += augmented_coord[i] * augmented_strides[i];
operation->ifm.tiles.addresses[0] += address_offset;
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_add(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_ELTWISE;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->ifm2.tensor_idx = poperation->input_tensors[1]->index;
operation->ifm2.shape.height = poperation->input_tensors[1]->dims[1];
operation->ifm2.shape.width = poperation->input_tensors[1]->dims[2];
operation->ifm2.shape.depth = poperation->input_tensors[1]->dims[3];
operation->ifm2.zero_point = poperation->input_tensors[1]->zero_point;
operation->ifm2.scale = poperation->input_tensors[1]->scale;
operation->ifm2.is_signed = poperation->input_tensors[1]->is_signed;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
allocate_feature_maps(subgraph, operation);
thames_allocate_feature_map(subgraph, &operation->ifm2);
operation->ifm2.tiles.height_0 = operation->ifm2.shape.height;
operation->ifm2.tiles.height_1 = operation->ifm2.shape.height;
operation->ifm2.tiles.width_0 = operation->ifm2.shape.width;
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_dma(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *conv_operation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_DMA;
operation->dma.address = conv_operation->conv.scales.address;
operation->dma.size = conv_operation->conv.scales.size + conv_operation->conv.weights.size;
conv_operation->conv.scales.region = SCRATCH_REGION;
conv_operation->conv.scales.address = 0;
conv_operation->conv.weights.region = SCRATCH_REGION;
conv_operation->conv.weights.address = conv_operation->conv.scales.size;
}
static void
register_tensors(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperations,
unsigned count)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->input_count; j++) {
struct pipe_tensor *ptensor = poperation->input_tensors[j];
thames_register_tensor(subgraph, ptensor);
}
for (unsigned j = 0; j < poperation->output_count; j++) {
struct pipe_tensor *ptensor = poperation->output_tensors[j];
thames_register_tensor(subgraph, ptensor);
}
}
}
void
thames_lower_graph(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperations, unsigned count)
{
register_tensors(subgraph, poperations, count);
/* Lower */
for (unsigned i = 0; i < count; i++) {
struct thames_operation operation = {0};
switch (poperations[i].type) {
case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
struct pipe_tensor *input_tensor = poperations[i].input_tensors[0];
const struct pipe_ml_operation *producer = thames_find_first_producer(poperations, count, input_tensor->index);
bool padded_input = producer && producer->type == PIPE_ML_OPERATION_TYPE_PAD;
if (padded_input) {
input_tensor = producer->input_tensors[0];
}
thames_lower_convolution(subgraph, &poperations[i], input_tensor, &operation);
if (padded_input) {
operation.pad.top = 1;
operation.pad.left = 1;
}
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_ADD: {
thames_lower_add(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_POOLING: {
thames_lower_pooling(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: {
thames_lower_strided_slice(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_CONCATENATION: {
for (unsigned j = 0; j < poperations[i].input_count; j++) {
thames_lower_concatenation(subgraph, &poperations[i], j, &operation);
util_dynarray_append(&subgraph->operations, operation);
}
break;
}
case PIPE_ML_OPERATION_TYPE_RESIZE: {
thames_lower_resize(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_PAD: {
// Just ignore the pad operation for now, as it will be handled by its consumers
break;
}
default:
DBG("poperation->type %d\n", poperations[i].type);
UNREACHABLE("Unsupported ML operation type");
}
}
}

View file

@@ -0,0 +1,15 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_LOWER_H
#define THAMES_LOWER_H
#include "thames_ml.h"
void
thames_lower_graph(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperations, unsigned count);
#endif /* THAMES_LOWER_H */

View file

@@ -0,0 +1,363 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_inlines.h"
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <xf86drm.h>
#include "drm-uapi/thames_accel.h"
#include "thames_cmd.h"
#include "thames_lower.h"
#include "thames_ml.h"
void
thames_dump_buffer(const uint8_t *ptr, const char *name, int operation_nr,
int suboperation_nr, int offset, unsigned size)
{
char buffer[255];
snprintf(buffer, sizeof(buffer), "mesa-%s-%03d-%03d.bin", name, operation_nr,
suboperation_nr);
FILE *f = fopen(buffer, "wb");
assert(f);
if (!f)
return;
fwrite(ptr + offset, 1, size, f);
if (ferror(f)) {
DBG("Error in writing to file: %s\n", strerror(errno));
}
fflush(f);
fclose(f);
}
void
thames_register_tensor(struct thames_subgraph *subgraph,
const struct pipe_tensor *ptensor)
{
struct thames_tensor new_tensor = {0};
new_tensor.index = ptensor->index;
new_tensor.shape.height = ptensor->dims[1];
new_tensor.shape.width = ptensor->dims[2];
new_tensor.shape.depth = ptensor->dims[3];
new_tensor.layout = THAMES_LAYOUT_NHWC;
util_dynarray_append(&subgraph->tensors, new_tensor);
}
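/*
 * Feature maps are bump-allocated inside the shared IO buffer. Illustrative
 * sizes: an NHWC tensor of 8x8x3 takes 192 bytes and advances io_used by
 * ALIGN_POT(192, 16) = 192, while a 5x5x3 tensor (75 bytes) advances it by
 * 80; a tensor that has already been sized just reuses its recorded offset.
 */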
void
thames_allocate_feature_map(struct thames_subgraph *subgraph, struct thames_feature_map *feature_map)
{
struct thames_tensor *tensor = thames_find_tensor(subgraph, feature_map->tensor_idx);
unsigned size;
assert(tensor);
if (tensor->layout == THAMES_LAYOUT_NHWC) {
size = tensor->shape.width * tensor->shape.height * tensor->shape.depth;
} else if (tensor->layout == THAMES_LAYOUT_NHCWB16) {
size = tensor->shape.width * tensor->shape.height * ALIGN(tensor->shape.depth, 16);
} else {
assert(0 && "Unsupported layout");
size = 0; /* This should never happen */
}
if (tensor->size > 0) {
feature_map->tiles.addresses[0] = tensor->offset;
return;
}
tensor->offset = subgraph->io_used;
tensor->size = size;
subgraph->io_used += ALIGN_POT(size, 16);
feature_map->tiles.addresses[0] = tensor->offset;
}
struct thames_tensor *
thames_find_tensor(struct thames_subgraph *subgraph, unsigned tensor_idx)
{
util_dynarray_foreach (&subgraph->tensors, struct thames_tensor, tensor) {
if (tensor->index == tensor_idx) {
return tensor;
}
}
return NULL;
}
int
thames_round_up_to_multiple(int a, int b)
{
return ((a + b - 1) / b) * b;
}
int
thames_round_up_divide(int a, int b)
{
return (a + b - 1) / b;
}
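/*
 * The quantization below maps a floating-point scale onto a Q0.31 integer
 * multiplier plus a right shift. Worked example: scale = 0.125 = 0.5 * 2^-2,
 * so frexp() yields significand 0.5 and exponent -2; the multiplier becomes
 * round(0.5 * 2^31) = 1 << 30 and the shift 31 - (-2) = 33.
 */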
int
thames_quantize_scale(double scale, uint32_t *shift)
{
int exponent = 0;
double significand = frexp(scale, &exponent);
uint32_t quantized_scale = round(significand * (double)(1LL << 31));
/* Keep the shift signed while clamping: 31 - exponent goes negative for
* scales >= 1.0, which an unsigned *shift could never represent. */
int shift_val = 31 - exponent;
if (shift_val > 63) {
if (quantized_scale > exp2(shift_val - 63)) {
quantized_scale = quantized_scale >> (shift_val - 63);
shift_val = 63;
} else {
/* Not possible to get back within bounds, set scale and shift to 0
* as the shift would shift away all relevant bits anyway. */
quantized_scale = 0;
shift_val = 0;
}
} else if (shift_val < 0 && quantized_scale < exp2(shift_val + 32)) {
quantized_scale = quantized_scale << (0 - shift_val);
shift_val = 0;
}
*shift = shift_val;
return quantized_scale;
}
static bool
tensor_quantization_supported(struct pipe_tensor *tensor)
{
/*
* Per-axis quantization not supported, for details see:
* https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor
*/
return tensor->scales == NULL && tensor->zero_points == NULL;
}
bool
thames_ml_operation_supported(struct pipe_context *pcontext,
const struct pipe_ml_operation *operation)
{
bool supported = false;
switch (operation->type) {
case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
struct pipe_tensor *input_tensor = operation->input_tensors[0];
struct pipe_tensor *weight_tensor = operation->conv.weight_tensor;
struct pipe_tensor *bias_tensor = operation->conv.bias_tensor;
struct pipe_tensor *output_tensor = operation->output_tensors[0];
// Dilation and per-axis quantization not yet implemented
if (tensor_quantization_supported(input_tensor) &&
tensor_quantization_supported(weight_tensor) &&
tensor_quantization_supported(bias_tensor) &&
tensor_quantization_supported(output_tensor) &&
operation->conv.dilation_width_factor == 1 &&
operation->conv.dilation_height_factor == 1)
supported = true;
break;
}
case PIPE_ML_OPERATION_TYPE_ADD:
supported = operation->input_tensors[0]->resource == NULL &&
operation->input_tensors[1]->resource == NULL;
break;
case PIPE_ML_OPERATION_TYPE_POOLING:
case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE:
case PIPE_ML_OPERATION_TYPE_PAD:
case PIPE_ML_OPERATION_TYPE_RESIZE:
supported = true;
break;
case PIPE_ML_OPERATION_TYPE_CONCATENATION:
supported = operation->conc.axis == 3 ||
operation->conc.axis == -1;
break;
default:
supported = false;
}
return supported;
}
static const uint8_t kernel_data[] = {
#include "thames_kernel_bin.h"
};
struct pipe_ml_subgraph *
thames_ml_subgraph_create(struct pipe_context *pcontext,
const struct pipe_ml_operation *poperations,
unsigned count)
{
struct pipe_screen *pscreen = pcontext->screen;
struct thames_screen *screen = thames_screen(pscreen);
struct thames_subgraph *subgraph;
subgraph = calloc(1, sizeof(*subgraph));
if (!subgraph)
return NULL;
subgraph->base.context = pcontext;
util_dynarray_init(&subgraph->tensors, NULL);
util_dynarray_init(&subgraph->operations, NULL);
thames_lower_graph(subgraph, poperations, count);
#if 0
thames_emit_cmdstream(subgraph);
struct drm_thames_cmdstream_bo_create cmd_bo_create = {
.size = (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor),
.data = (uintptr_t)subgraph->cmdstream,
};
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS))
thames_dump_buffer((uint8_t *)subgraph->cmdstream, "cmdstream", 0, 0, 0, (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor));
int ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE, &cmd_bo_create);
assert(ret == 0);
free(subgraph->cmdstream);
subgraph->cmdstream_bo = cmd_bo_create.handle;
if (subgraph->coefs_used > 0) {
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->coefs_used);
pipe_buffer_write(subgraph->base.context, subgraph->coefs_rsrc, 0, subgraph->coefs_used, subgraph->coefs);
free(subgraph->coefs);
subgraph->coefs = NULL;
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "coefs", 0, 0, 0, pipe_buffer_size(subgraph->coefs_rsrc));
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
}
#endif
subgraph->kernel_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, ARRAY_SIZE(kernel_data));
pipe_buffer_write(pcontext, subgraph->kernel_rsrc, 0, ARRAY_SIZE(kernel_data), kernel_data);
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->kernel_rsrc,
PIPE_MAP_READ, &transfer_in);
DBG("Copied string %s to BO %d at %p\n", (const char *)kernel_data, thames_resource(subgraph->kernel_rsrc)->handle, buf);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
subgraph->io_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->io_used);
return &subgraph->base;
}
void
thames_ml_subgraph_invoke(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned inputs_count, unsigned input_idxs[],
void *inputs[], bool is_signed[])
{
struct thames_screen *screen = thames_screen(pcontext->screen);
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
struct drm_thames_submit submit = {0};
struct drm_thames_job job = {0};
struct timespec start, end;
int ret;
for (unsigned i = 0; i < inputs_count; i++) {
struct thames_tensor *input = thames_find_tensor(subgraph, input_idxs[i]);
assert(input);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS))
thames_dump_buffer(inputs[i], "input", 0, 0, 0, input->size);
pipe_buffer_write(pcontext, subgraph->io_rsrc, input->offset, input->size, inputs[i]);
}
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc));
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
job.kernel = thames_resource(subgraph->kernel_rsrc)->handle;
job.kernel_size = pipe_buffer_size(subgraph->kernel_rsrc);
if (subgraph->coefs_rsrc)
job.region_bo_handles[COEFS_REGION] = thames_resource(subgraph->coefs_rsrc)->handle;
job.region_bo_handles[IO_REGION] = thames_resource(subgraph->io_rsrc)->handle;
submit.jobs = (uintptr_t)&job;
submit.job_count = 1;
if (DBG_ENABLED(THAMES_DBG_MSGS))
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_SUBMIT, &submit);
assert(ret == 0);
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
long long duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec);
DBG("Submission took %lld ms\n", duration_ns / 1000000);
/* Force a sync */
struct pipe_transfer *transfer_in;
pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, PIPE_MAP_READ, &transfer_in);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec);
DBG("Execution took %lld ms\n", duration_ns / 1000000);
}
}
void
thames_ml_subgraph_read_outputs(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned outputs_count,
unsigned output_idxs[], void *outputsv[],
bool is_signed[])
{
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
uint8_t **outputs = (uint8_t **)outputsv;
for (unsigned i = 0; i < outputs_count; i++) {
struct thames_tensor *output = thames_find_tensor(subgraph, output_idxs[i]);
assert(output);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc));
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size, outputs[i]);
}
}
void
thames_ml_subgraph_destroy(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph)
{
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
pipe_resource_reference(&subgraph->io_rsrc, NULL);
pipe_resource_reference(&subgraph->coefs_rsrc, NULL);
pipe_resource_reference(&subgraph->kernel_rsrc, NULL);
util_dynarray_fini(&subgraph->operations);
util_dynarray_fini(&subgraph->tensors);
free(subgraph);
}

View file

@@ -0,0 +1,226 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_ML_H
#define THAMES_ML_H
#include <util/u_dynarray.h>
#include "thames_device.h"
#define SHRAM_BANKS 48
#define SHRAM_RESERVED_OUTPUT_BANKS 2
#define SHRAM_RESERVED_UNUSED_BANKS 2
#define SHRAM_RESERVED_END_BANKS 2
#define SHRAM_TOTAL_BANKS SHRAM_BANKS
#define SHRAM_BANK_SIZE_BYTES 1024
#define ACC_BITS 32 /* Always use 32-bit accumulators for now */
#define IFM_GRANULE 8
#define ACC_GRANULE 16
#define ARCH_SPLIT_DEPTH 16
#define BANK_SIZE_BYTES 1024
struct thames_block {
unsigned width;
unsigned height;
unsigned depth;
};
extern struct thames_block ARCH_OFM_BLOCK_MAX;
extern struct thames_block SUB_KERNEL_MAX;
extern struct thames_block IFM_UBLOCK;
extern struct thames_block OFM_UBLOCK;
#define COEFS_REGION 0
#define IO_REGION 1
#define SCRATCH_REGION 2
enum thames_operation_type {
THAMES_OPERATION_TYPE_CONVOLUTION,
THAMES_OPERATION_TYPE_POOLING,
THAMES_OPERATION_TYPE_ELTWISE,
THAMES_OPERATION_TYPE_DMA,
};
struct thames_tile_box {
unsigned height_0; /* The height of tile 0 */
unsigned height_1; /* The height of tile 1, 0 if unused */
unsigned width_0; /* The width of tile 0, and tile 2 (if used) */
unsigned addresses[4]; /* A list of 4 addresses, set unused addresses to 0 */
};
enum thames_layout {
THAMES_LAYOUT_NHWC,
THAMES_LAYOUT_NHCWB16,
};
enum thames_rounding_mode {
THAMES_ROUNDING_DOUBLE = 0,
THAMES_ROUNDING_TRUNCATE,
THAMES_ROUNDING_NATURAL,
};
struct thames_feature_map {
unsigned tensor_idx;
struct thames_block shape;
bool is_signed;
struct thames_tile_box tiles;
unsigned zero_point;
float scale;
};
struct thames_kernel {
unsigned height;
unsigned width;
unsigned stride_y;
unsigned stride_x;
unsigned dilation_y;
unsigned dilation_x;
bool depthwise;
bool is_signed;
unsigned zero_point;
float scale;
};
struct thames_padding {
unsigned top;
unsigned left;
unsigned bottom;
unsigned right;
};
struct thames_address_range {
unsigned region;
unsigned address;
long size;
};
struct thames_shram_layout {
unsigned ib_start;
unsigned ib_end;
unsigned ib_start2;
unsigned ab_start;
unsigned lut_start;
};
enum thames_acc_type {
THAMES_ACC_TYPE_INT_32BIT = 0,
THAMES_ACC_TYPE_INT_40BIT,
THAMES_ACC_TYPE_FP_S5_10,
};
struct thames_block_config {
struct thames_block ifm_block;
struct thames_block ofm_block;
struct thames_shram_layout shram_layout;
unsigned bank_size;
enum thames_acc_type acc_type;
bool is_partkernel;
};
#define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT */
struct thames_operation {
enum thames_operation_type type;
struct thames_block_config block_config;
union {
struct {
struct thames_address_range weights;
struct thames_address_range scales;
bool part_kernel_first;
bool depthwise;
} conv;
struct {
bool avg; /* true for avg, false for max */
} pooling;
struct {
unsigned lut_bytes;
} eltwise;
struct {
unsigned address;
long size;
} dma;
};
struct thames_feature_map ifm;
struct thames_feature_map ifm2;
struct thames_feature_map ofm;
struct thames_kernel kernel;
struct thames_padding pad;
bool upscale;
enum thames_rounding_mode round_mode;
struct thames_address_range read_accesses[MAX_MEMORY_ACCESSES];
struct thames_address_range write_accesses[MAX_MEMORY_ACCESSES];
};
struct thames_tensor {
unsigned index;
unsigned offset;
unsigned size;
struct thames_block shape;
enum thames_layout layout;
};
struct thames_subgraph {
struct pipe_ml_subgraph base;
struct util_dynarray operations; /* struct thames_operation */
struct util_dynarray tensors; /* struct thames_tensor */
struct pipe_resource *kernel_rsrc;
struct pipe_resource *io_rsrc;
unsigned io_used;
uint8_t *coefs;
struct pipe_resource *coefs_rsrc;
unsigned coefs_used;
};
bool
thames_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation);
struct pipe_ml_subgraph *
thames_ml_subgraph_create(struct pipe_context *pcontext,
const struct pipe_ml_operation *poperations,
unsigned count);
void thames_ml_subgraph_invoke(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned inputs_count, unsigned input_idxs[],
void *inputs[], bool is_signed[]);
void thames_ml_subgraph_read_outputs(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned outputs_count,
unsigned output_idxs[], void *outputs[],
bool is_signed[]);
void thames_ml_subgraph_destroy(struct pipe_context *context,
struct pipe_ml_subgraph *psubgraph);
void thames_allocate_feature_map(struct thames_subgraph *subgraph, struct thames_feature_map *feature_map);
void thames_register_tensor(struct thames_subgraph *subgraph, const struct pipe_tensor *ptensor);
struct thames_tensor *thames_find_tensor(struct thames_subgraph *subgraph, unsigned tensor_idx);
void thames_dump_buffer(const uint8_t *ptr, const char *name, int operation_nr,
int suboperation_nr, int offset, unsigned size);
int thames_round_up_to_multiple(int a, int b);
int thames_round_up_divide(int a, int b);
int thames_quantize_scale(double scale, uint32_t *shift);
#endif /* THAMES_ML_H */

View file

@@ -0,0 +1,3 @@
#include "pipe/p_state.h"
struct pipe_ml_device *thames_ml_device_create(const char *spec);

View file

@@ -0,0 +1,193 @@
/*
* Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "thames_sched.h"
static int
required_input_size(int value, int stride, int border)
{
return (value - 1) * stride + border;
}
static struct thames_block
_get_ifm_blocksize(struct thames_operation *operation, struct thames_block ofm_block)
{
struct thames_block ifm_block = {0};
// IFM block height
int h = required_input_size(ofm_block.height, operation->kernel.stride_y, MIN2(operation->kernel.height, SUB_KERNEL_MAX.height));
h = ALIGN(h, OFM_UBLOCK.height);
// IFM block width
int w = required_input_size(ofm_block.width, operation->kernel.stride_x, MIN2(operation->kernel.width, SUB_KERNEL_MAX.width));
w = ALIGN(w, OFM_UBLOCK.width);
ifm_block.height = h;
ifm_block.width = w;
ifm_block.depth = ofm_block.depth;
return ifm_block;
}
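/*
 * SHRAM bank accounting example for the allocator below (illustrative
 * numbers): a 16x16x8 IFM block needs 16 * 16 * 8 = 2048 bytes, i.e.
 * ALIGN(DIV_ROUND_UP(2048, 1024) * 2, IFM_GRANULE) = 8 banks placed after
 * the 2 reserved output banks; with no LUT, 2 end banks are reserved
 * (lut_start = 46); a 16x16x8 OFM block with 32-bit accumulators needs
 * 8192 bytes, i.e. 16 banks, so ab_start = 46 - 16 = 30 and the layout
 * fits (ib_end = 10 <= 30).
 */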
static bool
try_block_config(struct thames_operation *operation, struct thames_block ofm_block, struct thames_block ifm_block, struct thames_shram_layout *layout)
{
int ifm_bytes = ifm_block.width * ifm_block.height * ALIGN(ifm_block.depth, 8);
int ifm_banks = ALIGN(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE);
int lut_bytes = operation->type == THAMES_OPERATION_TYPE_ELTWISE ? operation->eltwise.lut_bytes : 0;
int lut_banks = MAX2(DIV_ROUND_UP(lut_bytes, 1024), SHRAM_RESERVED_END_BANKS);
int lut_start = SHRAM_TOTAL_BANKS - lut_banks;
int ifm_end = SHRAM_RESERVED_OUTPUT_BANKS + ifm_banks;
int ifm2_start = ifm_end;
int acc_start = lut_start;
if (operation->type != THAMES_OPERATION_TYPE_ELTWISE) {
int acc_bytes = (ofm_block.width * ofm_block.height * ALIGN(ofm_block.depth, 8) * 32) / 8;
int acc_banks = ALIGN(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE);
acc_start -= acc_banks;
} else {
int ifm2_banks = ifm_banks; /* TODO: Fix for scalar eltwise */
if (ifm2_start + ifm2_banks > acc_start)
return false;
ifm_end = acc_start;
}
if (ifm_end > acc_start)
return false;
layout->ib_start = SHRAM_RESERVED_OUTPUT_BANKS;
layout->ib_start2 = ifm2_start;
layout->ib_end = ifm_end;
layout->ab_start = acc_start;
layout->lut_start = lut_start;
return true;
}
static struct thames_block_config
find_block_config(struct thames_operation *operation)
{
struct thames_block_config config = {0};
struct thames_block search_space = ARCH_OFM_BLOCK_MAX;
float ofm_elements = operation->ofm.shape.width * operation->ofm.shape.height * operation->ofm.shape.depth;
float ifm_elements = operation->ifm.shape.width * operation->ifm.shape.height * operation->ifm.shape.depth;
bool is_pooling = operation->type == THAMES_OPERATION_TYPE_POOLING;
bool is_convolution = operation->type == THAMES_OPERATION_TYPE_CONVOLUTION;
/* Only read the conv union member when this actually is a convolution. */
bool is_depthwise = is_convolution && operation->conv.depthwise;
bool is_equal_depth = is_pooling || is_depthwise || operation->type == THAMES_OPERATION_TYPE_ELTWISE;
float best_cost = FLT_MAX;
float best_coverage = FLT_MAX;
search_space.width = MIN2(search_space.width, operation->ofm.shape.width);
search_space.height = MIN2(search_space.height, operation->ofm.shape.height);
search_space.depth = MIN2(search_space.depth, operation->ofm.shape.depth);
unsigned depth = MAX2(OFM_UBLOCK.depth, MIN2(search_space.depth, ARCH_SPLIT_DEPTH));
if (depth < operation->ofm.shape.depth) {
depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
}
search_space.width = ALIGN(search_space.width, OFM_UBLOCK.width);
search_space.height = ALIGN(search_space.height, OFM_UBLOCK.height);
search_space.depth = ALIGN(search_space.depth, OFM_UBLOCK.depth);
while (depth <= search_space.depth) {
bool wont_fit[search_space.height + 1][search_space.width + 1];
memset(wont_fit, 0, sizeof(wont_fit));
for (unsigned height = OFM_UBLOCK.height; height <= search_space.height; height += OFM_UBLOCK.height) {
for (unsigned width = OFM_UBLOCK.width; width <= search_space.width; width += OFM_UBLOCK.width) {
if (wont_fit[height][width])
continue;
struct thames_block ofm_block = {.width = width, .height = height, .depth = depth};
struct thames_block ifm_block = _get_ifm_blocksize(operation, ofm_block);
if (!is_equal_depth)
ifm_block.depth = ALIGN(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth);
// Try to fit the blocks in SHRAM
struct thames_shram_layout layout = {0};
if (try_block_config(operation, ofm_block, ifm_block, &layout)) {
struct thames_block full_blocks = {DIV_ROUND_UP(operation->ofm.shape.width, ofm_block.width),
DIV_ROUND_UP(operation->ofm.shape.height, ofm_block.height),
DIV_ROUND_UP(operation->ofm.shape.depth, ofm_block.depth)};
float blocks[3] = {operation->ofm.shape.width / (float)ofm_block.width,
operation->ofm.shape.height / (float)ofm_block.height,
operation->ofm.shape.depth / (float)ofm_block.depth};
float weight_area = is_convolution ? operation->kernel.width * operation->kernel.height : 0;
float weight_fetch = weight_area * operation->ifm.shape.depth * full_blocks.width * full_blocks.height;
if (!is_depthwise)
weight_fetch *= blocks[2] * ofm_block.depth;
float ifm_fetch = ifm_block.width * ifm_block.height * operation->ifm.shape.depth * blocks[0] * blocks[1];
if (!is_equal_depth)
ifm_fetch *= full_blocks.depth;
float relative_cost = 0;
if (operation->type != THAMES_OPERATION_TYPE_ELTWISE)
relative_cost = (ifm_fetch + weight_fetch) / ofm_elements;
else
relative_cost = ofm_elements / (height * width * depth);
if (ifm_elements < ifm_block.width * ifm_block.height * ifm_block.depth * 2)
relative_cost /= 2.0f;
if (relative_cost <= best_cost) {
bool choose_this = false;
if (relative_cost == best_cost) {
struct thames_block coverage_shape = {
.width = MIN2(ifm_block.width, operation->ifm.shape.width),
.height = MIN2(ifm_block.height, operation->ifm.shape.height),
.depth = MIN2(ifm_block.depth, operation->ifm.shape.depth)};
float coverage = (float)(operation->ifm.shape.width * operation->ifm.shape.height) /
(float)MAX2(1, coverage_shape.width * coverage_shape.height);
if (coverage <= best_coverage && (height <= 4 && width <= 4)) {
best_coverage = coverage;
choose_this = true;
}
} else {
best_coverage = UINT_MAX;
choose_this = true;
}
if (choose_this) {
config.shram_layout = layout;
config.ifm_block = ifm_block;
config.ofm_block.height = height;
config.ofm_block.width = width;
config.ofm_block.depth = depth;
best_cost = relative_cost;
}
}
} else {
wont_fit[height][width] = true;
}
}
}
depth += OFM_UBLOCK.depth;
if (depth < operation->ofm.shape.depth) {
depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
}
}
return config;
}
void
thames_sched_operation(struct thames_subgraph *subgraph, struct thames_operation *operation)
{
operation->block_config = find_block_config(operation);
}

View file

@@ -0,0 +1,13 @@
/*
* Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_SCHED_H
#define THAMES_SCHED_H
#include "thames_ml.h"
void thames_sched_operation(struct thames_subgraph *subgraph, struct thames_operation *operation);
#endif /* THAMES_SCHED_H */

View file

@@ -856,7 +856,8 @@ find_accel_device()
for (int i = 0; i < n; i++) {
if (strstr("rocket", devs[i]->driver_name) ||
strstr("ethosu", devs[i]->driver_name))
strstr("ethosu", devs[i]->driver_name) ||
strstr("thames", devs[i]->driver_name))
device = devs[i];
else
pipe_loader_release(&devs[i], 1);

View file

@@ -196,6 +196,12 @@ if with_gallium_ethosu
else
driver_ethosu = declare_dependency()
endif
if with_gallium_thames
subdir('winsys/thames/drm')
subdir('drivers/thames')
else
driver_thames = declare_dependency()
endif
if with_gallium_zink
subdir('drivers/zink')
else

View file

@@ -59,7 +59,7 @@ libgallium_dri = shared_library(
driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
driver_tegra, driver_i915, driver_svga, driver_virgl,
driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
driver_asahi, driver_crocus, driver_rocket, driver_ethosu
driver_asahi, driver_crocus, driver_rocket, driver_ethosu, driver_thames
],
install : true,
name_suffix : libname_suffix,

View file

@@ -10,6 +10,7 @@ libteflon = shared_library(
driver_etnaviv,
driver_rocket,
driver_ethosu,
driver_thames,
idep_nir,
idep_mesautil,
],

View file

@@ -0,0 +1,13 @@
# Copyright 2017 Broadcom
# SPDX-License-Identifier: MIT
libthameswinsys = static_library(
'thameswinsys',
files('thames_drm_winsys.c'),
include_directories : [
inc_src, inc_include,
inc_gallium, inc_gallium_aux, inc_gallium_drivers,
],
gnu_symbol_visibility : 'hidden',
dependencies: [dep_libdrm, idep_mesautil],
)

View file

@@ -0,0 +1,17 @@
/*
* Copyright 2014 Broadcom
* Copyright 2018 Alyssa Rosenzweig
* Copyright 2025 Tomeu Vizoso
* SPDX-License-Identifier: MIT
*/
#ifndef __THAMES_DRM_PUBLIC_H__
#define __THAMES_DRM_PUBLIC_H__
struct pipe_screen;
struct pipe_screen_config;
struct pipe_screen *
thames_drm_screen_create(int drmFD, const struct pipe_screen_config *config);
#endif /* __THAMES_DRM_PUBLIC_H__ */

View file

@@ -0,0 +1,19 @@
/*
* Copyright 2014 Broadcom
* Copyright 2018 Alyssa Rosenzweig
* Copyright 2025 Tomeu Vizoso
* SPDX-License-Identifier: MIT
*/
#include "util/os_file.h"
#include "util/u_screen.h"
#include "thames/thames_device.h"
#include "thames_drm_public.h"
struct pipe_screen *
thames_drm_screen_create(int fd, const struct pipe_screen_config *config)
{
return u_pipe_screen_lookup_or_create(os_dupfd_cloexec(fd), config, NULL,
thames_screen_create);
}