WIP: thames: initial commit

Tomeu Vizoso 2025-10-28 12:39:12 +01:00
parent 562bb8b62b
commit 931683bab6
34 changed files with 1967 additions and 4 deletions

View file

@@ -5,6 +5,7 @@ src/gallium/drivers/ethosu/**/*
src/gallium/drivers/i915
src/gallium/drivers/r300/compiler/*
src/gallium/drivers/rocket/**/*
src/gallium/drivers/thames/**/*
src/gallium/targets/teflon/**/*
src/gallium/frontends/teflon/**/*
src/amd/vulkan/**/*

View file

@@ -0,0 +1,198 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (C) 2025 Texas Instruments Incorporated - https://www.ti.com/ */
#ifndef _THAMES_DRM_H_
#define _THAMES_DRM_H_
#include "drm.h"
#if defined(__cplusplus)
extern "C" {
#endif
/**
* DOC: IOCTL IDs
*
* enum drm_thames_ioctl_id - IOCTL IDs
*
* Place new ioctls at the end, don't re-order, don't replace or remove entries.
*
* These IDs are not meant to be used directly. Use the DRM_IOCTL_THAMES_xxx
* definitions instead.
*/
enum drm_thames_ioctl_id {
/** @DRM_THAMES_BO_CREATE: Create a buffer object. */
DRM_THAMES_BO_CREATE,
/** @DRM_THAMES_BO_WAIT: Wait on a buffer object's fence. */
DRM_THAMES_BO_WAIT,
/**
* @DRM_THAMES_BO_MMAP_OFFSET: Get the file offset to pass to
* mmap to map a GEM object.
*/
DRM_THAMES_BO_MMAP_OFFSET,
/**
* @DRM_THAMES_CMDSTREAM_BO_CREATE: Create a command stream buffer
* object.
*/
DRM_THAMES_CMDSTREAM_BO_CREATE,
/** @DRM_THAMES_SUBMIT: Submit a job and BOs to run. */
DRM_THAMES_SUBMIT,
};
/**
* DOC: IOCTL arguments
*/
/**
* enum drm_thames_bo_flags - Buffer object flags, passed at creation time.
*/
enum drm_thames_bo_flags {
/**
* @DRM_THAMES_BO_NO_MMAP: The buffer object will never be CPU-mapped
* in userspace.
*/
DRM_THAMES_BO_NO_MMAP = (1 << 0),
};
/**
* struct drm_thames_bo_create - Arguments passed to DRM_IOCTL_THAMES_BO_CREATE.
*/
struct drm_thames_bo_create {
/**
* @size: Requested size for the object
*
* The (page-aligned) allocated size for the object will be returned.
*/
__u64 size;
/**
* @flags: Flags. Must be a combination of drm_thames_bo_flags flags.
*/
__u32 flags;
/**
* @handle: Returned handle for the object.
*
* Object handles are nonzero.
*/
__u32 handle;
};
/**
* struct drm_thames_bo_mmap_offset - Arguments passed to DRM_IOCTL_THAMES_BO_MMAP_OFFSET.
*/
struct drm_thames_bo_mmap_offset {
/** @handle: Handle of the object we want an mmap offset for. */
__u32 handle;
/** @pad: MBZ. */
__u32 pad;
/** @offset: The fake offset to use for subsequent mmap calls. */
__u64 offset;
};
/**
* struct drm_thames_bo_wait - ioctl argument for waiting for
* completion of the last DRM_THAMES_SUBMIT on a BO.
*
* This is useful for cases where multiple processes might be
* rendering to a BO and you want to wait for all rendering to be
* completed.
*/
struct drm_thames_bo_wait {
/** @handle: Handle of the BO being waited on. */
__u32 handle;
/** @pad: MBZ. */
__u32 pad;
/** @timeout_ns: Absolute timeout, in nanoseconds. */
__s64 timeout_ns;
};
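/*
 * Illustrative note (the kernel side is not part of this commit): assuming
 * CLOCK_MONOTONIC-based timeouts as in other DRM drivers, waiting at most
 * one second would look like:
 *   clock_gettime(CLOCK_MONOTONIC, &ts);
 *   bo_wait.timeout_ns = (__s64)ts.tv_sec * 1000000000 + ts.tv_nsec + 1000000000;
 * The Gallium driver in this commit simply passes INT64_MAX to wait forever.
 */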
/**
 * struct drm_thames_cmdstream_bo_create - Arguments passed to
 * DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE.
 */
struct drm_thames_cmdstream_bo_create {
/** @size: Size of the data argument. */
__u32 size;
/** @flags: Flags, currently must be 0. */
__u32 flags;
/** @data: Pointer to the data. */
__u64 data;
/** @handle: Returned GEM handle for the BO. */
__u32 handle;
/** @pad: Padding, must be 0. */
__u32 pad;
};
/**
* struct drm_thames_job - A job to be run on the NPU
*
* The kernel will schedule the execution of this job taking into account its
* dependencies with other jobs. All tasks in the same job will be executed
* sequentially on the same core, to benefit from memory residency in SRAM.
*/
struct drm_thames_job {
/** Input: BO handle for kernel. */
__u32 kernel;
/** Input: Size in bytes of the compiled kernel. */
__u32 kernel_size;
#define THAMES_MAX_REGIONS 8
/** Input: Array of BO handles for each region. */
__u32 region_bo_handles[THAMES_MAX_REGIONS];
};
/**
* struct drm_thames_submit - ioctl argument for submitting commands to the NPU.
*
* The kernel will schedule the execution of these jobs in dependency order.
*/
struct drm_thames_submit {
/** Input: Pointer to an array of struct drm_thames_job. */
__u64 jobs;
/** Input: Number of jobs passed in. */
__u32 job_count;
/** Reserved, must be zero. */
__u32 pad;
};
/**
* DRM_IOCTL_THAMES() - Build a thames IOCTL number
* @__access: Access type. Must be R, W or RW.
* @__id: One of the DRM_THAMES_xxx IDs.
* @__type: Suffix of the type being passed to the IOCTL.
*
* Don't use this macro directly, use the DRM_IOCTL_THAMES_xxx
* values instead.
*
* Return: An IOCTL number to be passed to ioctl() from userspace.
*/
#define DRM_IOCTL_THAMES(__access, __id, __type) \
DRM_IO ## __access(DRM_COMMAND_BASE + DRM_THAMES_ ## __id, \
struct drm_thames_ ## __type)
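/*
 * For example, DRM_IOCTL_THAMES(WR, BO_CREATE, bo_create) expands, via token
 * pasting, to DRM_IOWR(DRM_COMMAND_BASE + DRM_THAMES_BO_CREATE,
 * struct drm_thames_bo_create).
 */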
enum {
DRM_IOCTL_THAMES_BO_CREATE =
DRM_IOCTL_THAMES(WR, BO_CREATE, bo_create),
DRM_IOCTL_THAMES_BO_WAIT =
DRM_IOCTL_THAMES(WR, BO_WAIT, bo_wait),
DRM_IOCTL_THAMES_BO_MMAP_OFFSET =
DRM_IOCTL_THAMES(WR, BO_MMAP_OFFSET, bo_mmap_offset),
DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE =
DRM_IOCTL_THAMES(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create),
DRM_IOCTL_THAMES_SUBMIT =
DRM_IOCTL_THAMES(WR, SUBMIT, submit),
};
#if defined(__cplusplus)
}
#endif
#endif /* _THAMES_DRM_H_ */
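The ioctls above compose into a create → map → submit → wait flow. The sketch below is illustrative commentary, not part of this commit: the header name thames_accel.h follows the driver's drm-uapi/thames_accel.h include, the region indices follow the driver's COEFS_REGION (0) and IO_REGION (1) defines, and the already-opened DRM fd and error handling are assumptions.

/* Hypothetical userspace sketch; names and flow are assumptions as noted above. */
#include <stdint.h>
#include <sys/mman.h>
#include <xf86drm.h>
#include "thames_accel.h"

static void *
example_create_and_map_bo(int fd, __u64 size, __u32 *handle)
{
   /* Create a BO, then ask for the fake offset used to mmap it via the fd. */
   struct drm_thames_bo_create create = { .size = size };
   if (drmIoctl(fd, DRM_IOCTL_THAMES_BO_CREATE, &create))
      return NULL;
   *handle = create.handle;

   struct drm_thames_bo_mmap_offset off = { .handle = create.handle };
   if (drmIoctl(fd, DRM_IOCTL_THAMES_BO_MMAP_OFFSET, &off))
      return NULL;

   void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, off.offset);
   return map == MAP_FAILED ? NULL : map;
}

static int
example_submit_and_wait(int fd, __u32 kernel_bo, __u32 kernel_size,
                        __u32 coefs_bo, __u32 io_bo)
{
   /* One job; region slots 0/1 mirror the driver's COEFS_REGION/IO_REGION. */
   struct drm_thames_job job = {
      .kernel = kernel_bo,
      .kernel_size = kernel_size,
      .region_bo_handles = { coefs_bo, io_bo },
   };
   struct drm_thames_submit submit = {
      .jobs = (__u64)(uintptr_t)&job,
      .job_count = 1,
   };
   if (drmIoctl(fd, DRM_IOCTL_THAMES_SUBMIT, &submit))
      return -1;

   /* Wait for the last submit touching the IO BO, as the driver itself does. */
   struct drm_thames_bo_wait wait = {
      .handle = io_bo,
      .timeout_ns = INT64_MAX,
   };
   return drmIoctl(fd, DRM_IOCTL_THAMES_BO_WAIT, &wait);
}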

View file

@@ -186,7 +186,7 @@ elif gallium_drivers.contains('all')
gallium_drivers = [
'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915',
'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris',
'zink', 'd3d12', 'asahi', 'rocket', 'ethosu'
'zink', 'd3d12', 'asahi', 'rocket', 'ethosu', 'thames'
]
endif
@@ -215,6 +215,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12')
with_gallium_asahi = gallium_drivers.contains('asahi')
with_gallium_rocket = gallium_drivers.contains('rocket')
with_gallium_ethosu = gallium_drivers.contains('ethosu')
with_gallium_thames = gallium_drivers.contains('thames')
foreach gallium_driver : gallium_drivers
pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
endforeach

View file

@@ -88,7 +88,7 @@ option(
'all', 'auto',
'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris',
'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
'rocket', 'softpipe', 'svga', 'tegra', 'thames', 'v3d', 'vc4', 'virgl', 'zink',
],
description : 'List of gallium drivers to build. If this is set to auto ' +
'all drivers applicable to the target OS/architecture ' +

View file

@@ -45,6 +45,9 @@ endif
if with_gallium_ethosu
renderonly_drivers_c_args += '-DGALLIUM_ETHOSU'
endif
if with_gallium_thames
renderonly_drivers_c_args += '-DGALLIUM_THAMES'
endif
libpipe_loader_static = static_library(
'pipe_loader_static',

View file

@@ -87,6 +87,7 @@ static const struct drm_driver_descriptor *driver_descriptors[] = {
&rocket_driver_descriptor,
&ethosu_driver_descriptor,
&tegra_driver_descriptor,
&thames_driver_descriptor,
&lima_driver_descriptor,
&zink_driver_descriptor,
};
@@ -383,6 +384,9 @@ pipe_loader_get_compatible_render_capable_device_fds(int kms_only_fd, unsigned i
#if defined GALLIUM_ETHOSU
"ethosu",
#endif
#if defined GALLIUM_THAMES
"thames",
#endif
#if defined GALLIUM_V3D
"v3d",
#endif

View file

@@ -53,6 +53,7 @@ const struct drm_driver_descriptor descriptor_name = { \
#undef GALLIUM_ASAHI
#undef GALLIUM_ROCKET
#undef GALLIUM_ETHOSU
#undef GALLIUM_THAMES
#endif
#ifdef GALLIUM_I915
@@ -480,6 +481,24 @@ DRM_DRIVER_DESCRIPTOR(ethosu, NULL, 0)
DRM_DRIVER_DESCRIPTOR_STUB(ethosu)
#endif
#ifdef GALLIUM_THAMES
#include "thames/drm/thames_drm_public.h"
static struct pipe_screen *
pipe_thames_create_screen(int fd, const struct pipe_screen_config *config)
{
struct pipe_screen *screen;
screen = thames_drm_screen_create(fd, config);
return screen ? debug_screen_wrap(screen) : NULL;
}
DRM_DRIVER_DESCRIPTOR(thames, NULL, 0)
#else
DRM_DRIVER_DESCRIPTOR_STUB(thames)
#endif
#ifdef GALLIUM_KMSRO
#include "kmsro/drm/kmsro_drm_public.h"

View file

@@ -25,6 +25,7 @@ extern const struct drm_driver_descriptor rknpu_driver_descriptor;
extern const struct drm_driver_descriptor rocket_driver_descriptor;
extern const struct drm_driver_descriptor ethosu_driver_descriptor;
extern const struct drm_driver_descriptor tegra_driver_descriptor;
extern const struct drm_driver_descriptor thames_driver_descriptor;
extern const struct drm_driver_descriptor lima_driver_descriptor;
extern const struct drm_driver_descriptor zink_driver_descriptor;
extern const struct drm_driver_descriptor kmsro_driver_descriptor;

View file

@@ -0,0 +1,2 @@
BasedOnStyle: InheritParentConfig
DisableFormat: false

View file

@@ -0,0 +1,14 @@
Add.Op/.*
AddQuant.Op/.*
Conv2D.Op/.*
DepthwiseConv2D.Op/.*
FullyConnected.Op/.*
# Don't support unfused Pad operations yet
Models.Op/yolox_000
Models.Op/yolox_003
Models.Op/yolox_012
Models.Op/yolox_027
Models.Op/yolox_042
Models.Op/yolox_077
Models.Op/yolox_086

View file

@@ -0,0 +1,32 @@
# Copyright 2019 Google, Inc
# SPDX-License-Identifier: MIT
# thames_registers = custom_target(
# 'thames_registers.h',
# input : ['gen_parser.py', 'gen_header.py', 'registers.xml'],
# output : 'thames_registers.h',
# command : [prog_python, '@INPUT1@', '--rnn', '.', '--xml', '@INPUT2@', 'c-defines'],
# capture : true,
# )
files_thames = files(
'thames_cmd.c',
'thames_coefs.c',
'thames_device.c',
'thames_lower.c',
'thames_ml.c',
'thames_sched.c',
)
libthames = static_library(
'thames',
[files_thames], #, thames_registers],
include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src],
gnu_symbol_visibility : 'hidden',
dependencies : [idep_mesautil, dep_libdrm],
)
driver_thames = declare_dependency(
compile_args : '-DGALLIUM_THAMES',
link_with : [libthameswinsys, libthames]
)

View file

@@ -0,0 +1,21 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include <fcntl.h>
#include <math.h>
#include <stdbool.h>
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "thames_cmd.h"
#include "thames_coefs.h"
#include "thames_ml.h"
#include "thames_sched.h"
void
thames_emit_cmdstream(struct thames_subgraph *subgraph)
{
}

View file

@@ -0,0 +1,13 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_CMD_H
#define THAMES_CMD_H
#include "thames_ml.h"
void thames_emit_cmdstream(struct thames_subgraph *subgraph);
#endif /* THAMES_CMD_H */

View file

@@ -0,0 +1,17 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "util/u_inlines.h"
#include "thames_coefs.h"
void
thames_fill_coefs(struct thames_subgraph *subgraph,
struct thames_operation *operation,
struct pipe_resource *bias_rsrc,
struct pipe_resource *weight_rsrc)
{
}

View file

@@ -0,0 +1,17 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_COEFS_H
#define THAMES_COEFS_H
#include "thames_ml.h"
void
thames_fill_coefs(struct thames_subgraph *subgraph,
struct thames_operation *operation,
struct pipe_resource *bias_rsrc,
struct pipe_resource *weight_rsrc);
#endif /* THAMES_COEFS_H */

View file

@@ -0,0 +1,222 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "thames_device.h"
#include "thames_ml.h"
#include "drm-uapi/thames_accel.h"
#include <xf86drm.h>
#include "util/os_mman.h"
#include "util/u_inlines.h"
#include "util/u_surface.h"
#include "util/u_transfer.h"
static const struct debug_named_value thames_debug_options[] = {
{"dbg_msgs", THAMES_DBG_MSGS, "Print debug messages"},
{"dump_bos", THAMES_DBG_DUMP_BOS, "Dump buffers for analysis"},
{"zero_bos", THAMES_DBG_ZERO, "Zero buffers for debugging"},
DEBUG_NAMED_VALUE_END};
DEBUG_GET_ONCE_FLAGS_OPTION(thames_debug, "THAMES_DEBUG", thames_debug_options, 0)
int thames_debug = 0;
static void
thames_destroy_screen(struct pipe_screen *pscreen)
{
struct thames_screen *screen = thames_screen(pscreen);
ralloc_free(screen);
}
static void
thames_destroy_context(struct pipe_context *pctx)
{
struct thames_context *ctx = thames_context(pctx);
ralloc_free(ctx);
}
struct thames_transfer {
struct pipe_transfer base;
void *map;
};
static void *
thames_buffer_map(struct pipe_context *pctx,
struct pipe_resource *prsc, unsigned level,
unsigned usage, const struct pipe_box *box,
struct pipe_transfer **out_transfer)
{
struct thames_screen *screen = thames_screen(pctx->screen);
struct thames_resource *rsc = thames_resource(prsc);
struct drm_thames_bo_wait bo_wait = {0};
struct drm_thames_bo_mmap_offset bo_mmap_offset = {0};
int ret;
assert(level == 0);
assert(prsc->target == PIPE_BUFFER);
assert(box->y == 0);
assert(box->z == 0);
assert(box->height == 1);
assert(box->depth == 1);
struct thames_transfer *transfer = rzalloc(NULL, struct thames_transfer);
if (!transfer)
return NULL;
transfer->base.level = level;
transfer->base.usage = usage;
transfer->base.box = *box;
pipe_resource_reference(&transfer->base.resource, prsc);
bo_wait.handle = rsc->handle;
bo_wait.timeout_ns = INT64_MAX;
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_WAIT, &bo_wait);
if (ret == -1)
goto free_transfer;
bo_mmap_offset.handle = rsc->handle;
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_MMAP_OFFSET, &bo_mmap_offset);
if (ret == -1)
goto free_transfer;
uint8_t *map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED,
screen->fd, bo_mmap_offset.offset);
if (map == MAP_FAILED)
goto free_transfer;
transfer->map = map;
*out_transfer = &transfer->base;
return map + box->x;
free_transfer:
pipe_resource_reference(&transfer->base.resource, NULL);
ralloc_free(transfer);
return NULL;
}
static void
thames_buffer_unmap(struct pipe_context *pctx,
struct pipe_transfer *transfer)
{
struct thames_transfer *trans = (struct thames_transfer *)transfer;
/* Release the CPU mapping created in thames_buffer_map before dropping the
* resource reference taken there. */
os_munmap(trans->map, transfer->resource->width0);
pipe_resource_reference(&transfer->resource, NULL);
ralloc_free(trans);
}
static struct pipe_context *
thames_create_context(struct pipe_screen *screen,
void *priv, unsigned flags)
{
struct thames_context *ctx = rzalloc(NULL, struct thames_context);
if (!ctx)
return NULL;
struct pipe_context *pctx = &ctx->base;
pctx->screen = screen;
pctx->priv = priv;
pctx->destroy = thames_destroy_context;
pctx->buffer_map = thames_buffer_map;
pctx->buffer_unmap = thames_buffer_unmap;
pctx->resource_copy_region = util_resource_copy_region;
pctx->buffer_subdata = u_default_buffer_subdata;
pctx->clear_buffer = u_default_clear_buffer;
pctx->ml_operation_supported = thames_ml_operation_supported;
pctx->ml_subgraph_create = thames_ml_subgraph_create;
pctx->ml_subgraph_invoke = thames_ml_subgraph_invoke;
pctx->ml_subgraph_read_output = thames_ml_subgraph_read_outputs;
pctx->ml_subgraph_destroy = thames_ml_subgraph_destroy;
return pctx;
}
static struct pipe_resource *
thames_resource_create(struct pipe_screen *pscreen,
const struct pipe_resource *templat)
{
struct thames_screen *screen = thames_screen(pscreen);
struct drm_thames_bo_create arg = {0};
struct thames_resource *rsc;
int ret;
assert(templat->target == PIPE_BUFFER);
assert(templat->height0 == 1);
assert(templat->depth0 == 1);
assert(templat->array_size == 1);
rsc = rzalloc(NULL, struct thames_resource);
if (!rsc)
return NULL;
rsc->base = *templat;
rsc->base.screen = pscreen;
rsc->base.nr_samples = templat->nr_samples;
pipe_reference_init(&rsc->base.reference, 1);
rsc->bo_size = templat->width0;
arg.size = templat->width0;
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_CREATE, &arg);
if (ret < 0)
goto free_rsc;
rsc->handle = arg.handle;
return &rsc->base;
free_rsc:
ralloc_free(rsc);
return NULL;
}
static void
thames_resource_destroy(struct pipe_screen *pscreen,
struct pipe_resource *prsc)
{
struct thames_resource *rsc = thames_resource(prsc);
struct thames_screen *screen = thames_screen(pscreen);
struct drm_gem_close arg = {0};
int ret;
arg.handle = rsc->handle;
ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
assert(ret >= 0);
ralloc_free(rsc);
}
static int
thames_screen_get_fd(struct pipe_screen *pscreen)
{
return thames_screen(pscreen)->fd;
}
struct pipe_screen *
thames_screen_create(int fd,
const struct pipe_screen_config *config,
struct renderonly *ro)
{
struct thames_screen *thames_screen;
struct pipe_screen *screen;
thames_screen = rzalloc(NULL, struct thames_screen);
if (!thames_screen)
return NULL;
screen = &thames_screen->pscreen;
thames_debug = debug_get_option_thames_debug();
thames_screen->fd = fd;
screen->get_screen_fd = thames_screen_get_fd;
screen->destroy = thames_destroy_screen;
screen->context_create = thames_create_context;
screen->resource_create = thames_resource_create;
screen->resource_destroy = thames_resource_destroy;
return screen;
}

View file

@@ -0,0 +1,73 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "renderonly/renderonly.h"
#include "util/log.h"
#include "drm-uapi/thames_accel.h"
#ifndef THAMES_SCREEN_H
#define THAMES_SCREEN_H
enum thames_dbg {
THAMES_DBG_MSGS = BITFIELD_BIT(0),
THAMES_DBG_DUMP_BOS = BITFIELD_BIT(1),
THAMES_DBG_ZERO = BITFIELD_BIT(2),
};
extern int thames_debug;
#define DBG_ENABLED(flag) unlikely(thames_debug & (flag))
#define DBG(fmt, ...) \
do { \
if (DBG_ENABLED(THAMES_DBG_MSGS)) \
mesa_logd("%s:%d: " fmt, __func__, __LINE__, \
##__VA_ARGS__); \
} while (0)
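/*
 * Illustrative usage: with THAMES_DEBUG=dbg_msgs in the environment,
 * DBG("mapped handle %u\n", handle) is routed through mesa_logd(); without
 * it, the flag test fails and the message is skipped at runtime.
 */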
struct thames_screen {
struct pipe_screen pscreen;
int fd;
};
static inline struct thames_screen *
thames_screen(struct pipe_screen *p)
{
return (struct thames_screen *)p;
}
struct thames_context {
struct pipe_context base;
};
static inline struct thames_context *
thames_context(struct pipe_context *pctx)
{
return (struct thames_context *)pctx;
}
struct thames_resource {
struct pipe_resource base;
uint32_t handle;
uint64_t bo_size;
};
static inline struct thames_resource *
thames_resource(struct pipe_resource *p)
{
return (struct thames_resource *)p;
}
struct pipe_screen *thames_screen_create(int fd,
const struct pipe_screen_config *config,
struct renderonly *ro);
#endif /* THAMES_SCREEN_H */

View file

@@ -0,0 +1 @@
0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x0

View file

@@ -0,0 +1,454 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "thames_lower.h"
#include "thames_coefs.h"
#include "thames_sched.h"
static bool
is_depthwise(const struct pipe_ml_operation *poperation)
{
unsigned input_channels = poperation->input_tensors[0]->dims[3];
unsigned output_channels = poperation->output_tensors[0]->dims[3];
return poperation->conv.depthwise && input_channels > 1 &&
output_channels > 1;
}
static unsigned
needed_total_padding(unsigned input_size, unsigned stride, unsigned filter_size)
{
/* Compute signed so the subtraction can't wrap around on unsigned types. */
if (input_size % stride == 0)
return MAX2((int)filter_size - (int)stride, 0);
return MAX2((int)filter_size - (int)(input_size % stride), 0);
}
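/*
 * Worked example for the heuristic below (illustrative numbers): a 3x3
 * kernel (kernel_size = 9) over an IFM depth of 16 gives a depth-first
 * utilization of 16 / 32 = 0.5 and a part-kernel-first utilization of
 * (16 / 16) * (9 / 12) = 0.75, so part-kernel first is chosen.
 */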
static bool
thames_is_part_kernel_first(struct thames_operation *operation)
{
// Determine which block traversal strategy has better DPU utilization
unsigned kernel_size = operation->kernel.height * operation->kernel.width;
unsigned depth = operation->ifm.shape.depth;
float depth_utilization = (float)depth / thames_round_up_to_multiple(depth, 32);
float part_kernel_utilization = ((float)depth / thames_round_up_to_multiple(depth, 8));
part_kernel_utilization *= (float)kernel_size / thames_round_up_to_multiple(kernel_size, 4);
if (operation->type != THAMES_OPERATION_TYPE_CONVOLUTION)
return false;
if (operation->kernel.depthwise)
return false;
// Part-kernel first is always better for ifm depths <= 8
if (part_kernel_utilization >= depth_utilization || depth <= 8)
return true;
return false;
}
static void
set_feature_maps(struct pipe_tensor *input_tensor,
struct pipe_tensor *output_tensor,
struct thames_operation *operation)
{
operation->ifm.tensor_idx = input_tensor->index;
operation->ifm.shape.height = input_tensor->dims[1];
operation->ifm.shape.width = input_tensor->dims[2];
operation->ifm.shape.depth = input_tensor->dims[3];
operation->ifm.zero_point = input_tensor->zero_point;
operation->ifm.scale = input_tensor->scale;
operation->ifm.is_signed = input_tensor->is_signed;
operation->ofm.tensor_idx = output_tensor->index;
operation->ofm.shape.height = output_tensor->dims[1];
operation->ofm.shape.width = output_tensor->dims[2];
operation->ofm.shape.depth = output_tensor->dims[3];
operation->ofm.zero_point = output_tensor->zero_point;
operation->ofm.scale = output_tensor->scale;
operation->ofm.is_signed = output_tensor->is_signed;
}
static const struct pipe_ml_operation *
thames_find_first_consumer(const struct pipe_ml_operation *poperations,
unsigned count,
unsigned tensor_index)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->input_count; j++)
if (poperation->input_tensors[j]->index == tensor_index)
return poperation;
}
return NULL;
}
static void
allocate_feature_maps(struct thames_subgraph *subgraph, struct thames_operation *operation)
{
thames_allocate_feature_map(subgraph, &operation->ifm);
operation->ifm.tiles.height_0 = operation->ifm.shape.height;
operation->ifm.tiles.height_1 = operation->ifm.shape.height;
operation->ifm.tiles.width_0 = operation->ifm.shape.width;
thames_allocate_feature_map(subgraph, &operation->ofm);
operation->ofm.tiles.height_0 = operation->ofm.shape.height;
operation->ofm.tiles.height_1 = operation->ofm.shape.height;
operation->ofm.tiles.width_0 = operation->ofm.shape.width;
}
static const struct pipe_ml_operation *
thames_find_first_producer(const struct pipe_ml_operation *poperations, unsigned count,
unsigned tensor_index)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->output_count; j++) {
if (poperation->output_tensors[j]->index == tensor_index)
return poperation;
}
}
return NULL;
}
static void
thames_lower_convolution(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct pipe_tensor *input_tensor,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_CONVOLUTION;
operation->conv.depthwise = is_depthwise(poperation);
// operation->padding_same = poperation->conv.padding_same;
// operation->stride = poperation->conv.stride_x;
set_feature_maps(input_tensor, poperation->output_tensors[0], operation);
operation->kernel.height = poperation->conv.weight_tensor->dims[1];
operation->kernel.width = poperation->conv.weight_tensor->dims[2];
operation->kernel.stride_y = poperation->conv.stride_y;
operation->kernel.stride_x = poperation->conv.stride_x;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
operation->kernel.depthwise = is_depthwise(poperation);
operation->kernel.scale = poperation->conv.weight_tensor->scale;
operation->kernel.zero_point = poperation->conv.weight_tensor->zero_point;
operation->kernel.is_signed = poperation->conv.weight_tensor->is_signed;
operation->conv.part_kernel_first = thames_is_part_kernel_first(operation);
if (poperation->conv.padding_same) {
unsigned vert = needed_total_padding(input_tensor->dims[1], poperation->conv.stride_y, poperation->conv.weight_tensor->dims[1]);
unsigned horiz = needed_total_padding(input_tensor->dims[2], poperation->conv.stride_x, poperation->conv.weight_tensor->dims[2]);
operation->pad.top = vert / 2;
operation->pad.left = horiz / 2;
operation->pad.bottom = (vert + 1) / 2;
operation->pad.right = (horiz + 1) / 2;
} else {
operation->pad.top = 0;
operation->pad.left = 0;
operation->pad.bottom = 0;
operation->pad.right = 0;
}
allocate_feature_maps(subgraph, operation);
thames_sched_operation(subgraph, operation);
thames_fill_coefs(subgraph, operation, poperation->conv.bias_tensor->resource, poperation->conv.weight_tensor->resource);
}
static void
thames_lower_pooling(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = poperation->pooling.type == PIPE_ML_POOLING_TYPE_AVG;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->kernel.height = poperation->pooling.filter_height;
operation->kernel.width = poperation->pooling.filter_width;
operation->kernel.stride_y = poperation->pooling.stride_y;
operation->kernel.stride_x = poperation->pooling.stride_x;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
if (poperation->pooling.padding_same) {
unsigned vert = needed_total_padding(operation->ifm.shape.height, poperation->pooling.stride_y, poperation->pooling.filter_height);
unsigned horiz = needed_total_padding(operation->ifm.shape.width, poperation->pooling.stride_x, poperation->pooling.filter_width);
operation->pad.top = vert / 2;
operation->pad.left = horiz / 2;
operation->pad.bottom = (vert + 1) / 2;
operation->pad.right = (horiz + 1) / 2;
} else {
operation->pad.top = 0;
operation->pad.left = 0;
operation->pad.bottom = 0;
operation->pad.right = 0;
}
allocate_feature_maps(subgraph, operation);
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_concatenation(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
unsigned input_idx,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = true;
set_feature_maps(poperation->input_tensors[input_idx], poperation->output_tensors[0], operation);
operation->ofm.shape.depth = operation->ifm.shape.depth;
operation->round_mode = THAMES_ROUNDING_NATURAL;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
allocate_feature_maps(subgraph, operation);
for (unsigned i = 0; i < input_idx; i++) {
struct thames_tensor *tensor = thames_find_tensor(subgraph, operation->ofm.tensor_idx);
if (tensor->layout == THAMES_LAYOUT_NHWC)
operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[3];
else if (tensor->layout == THAMES_LAYOUT_NHCWB16)
operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * ALIGN(poperation->input_tensors[i]->dims[3], 16);
else
assert(0 && "Unsupported layout");
}
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_resize(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = true;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->ifm.zero_point = 0;
operation->ofm.zero_point = 0;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
operation->upscale = true;
allocate_feature_maps(subgraph, operation);
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_strided_slice(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_POOLING;
operation->pooling.avg = true;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->ifm.shape = operation->ofm.shape;
operation->ifm.zero_point = 0;
operation->ofm.zero_point = 0;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
allocate_feature_maps(subgraph, operation);
unsigned augmented_coord[5];
augmented_coord[0] = 0;
for (int i = 0; i < 4; ++i) {
augmented_coord[i + 1] = poperation->slice.begin[i];
}
unsigned augmented_strides[5];
augmented_strides[0] = operation->ifm.shape.depth * operation->ifm.shape.width * operation->ifm.shape.height;
augmented_strides[1] = 1;
augmented_strides[2] = operation->ifm.shape.depth * operation->ifm.shape.width;
augmented_strides[3] = operation->ifm.shape.depth;
augmented_strides[4] = 1;
unsigned address_offset = 0;
for (int i = 0; i < 5; ++i)
address_offset += augmented_coord[i] * augmented_strides[i];
operation->ifm.tiles.addresses[0] += address_offset;
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_add(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_ELTWISE;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->ifm2.tensor_idx = poperation->input_tensors[1]->index;
operation->ifm2.shape.height = poperation->input_tensors[1]->dims[1];
operation->ifm2.shape.width = poperation->input_tensors[1]->dims[2];
operation->ifm2.shape.depth = poperation->input_tensors[1]->dims[3];
operation->ifm2.zero_point = poperation->input_tensors[1]->zero_point;
operation->ifm2.scale = poperation->input_tensors[1]->scale;
operation->ifm2.is_signed = poperation->input_tensors[1]->is_signed;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
allocate_feature_maps(subgraph, operation);
thames_allocate_feature_map(subgraph, &operation->ifm2);
operation->ifm2.tiles.height_0 = operation->ifm2.shape.height;
operation->ifm2.tiles.height_1 = operation->ifm2.shape.height;
operation->ifm2.tiles.width_0 = operation->ifm2.shape.width;
thames_sched_operation(subgraph, operation);
}
static void
thames_lower_dma(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct thames_operation *conv_operation,
struct thames_operation *operation)
{
operation->type = THAMES_OPERATION_TYPE_DMA;
operation->dma.address = conv_operation->conv.scales.address;
operation->dma.size = conv_operation->conv.scales.size + conv_operation->conv.weights.size;
conv_operation->conv.scales.region = SCRATCH_REGION;
conv_operation->conv.scales.address = 0;
conv_operation->conv.weights.region = SCRATCH_REGION;
conv_operation->conv.weights.address = conv_operation->conv.scales.size;
}
static void
register_tensors(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperations,
unsigned count)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->input_count; j++) {
struct pipe_tensor *ptensor = poperation->input_tensors[j];
thames_register_tensor(subgraph, ptensor);
}
for (unsigned j = 0; j < poperation->output_count; j++) {
struct pipe_tensor *ptensor = poperation->output_tensors[j];
thames_register_tensor(subgraph, ptensor);
}
}
}
void
thames_lower_graph(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperations, unsigned count)
{
register_tensors(subgraph, poperations, count);
/* Lower */
for (unsigned i = 0; i < count; i++) {
struct thames_operation operation = {0};
switch (poperations[i].type) {
case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
struct pipe_tensor *input_tensor = poperations[i].input_tensors[0];
const struct pipe_ml_operation *producer = thames_find_first_producer(poperations, count, input_tensor->index);
bool padded_input = producer && producer->type == PIPE_ML_OPERATION_TYPE_PAD;
if (padded_input) {
input_tensor = producer->input_tensors[0];
}
thames_lower_convolution(subgraph, &poperations[i], input_tensor, &operation);
if (padded_input) {
operation.pad.top = 1;
operation.pad.left = 1;
}
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_ADD: {
thames_lower_add(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_POOLING: {
thames_lower_pooling(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: {
thames_lower_strided_slice(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_CONCATENATION: {
for (unsigned j = 0; j < poperations[i].input_count; j++) {
thames_lower_concatenation(subgraph, &poperations[i], j, &operation);
util_dynarray_append(&subgraph->operations, operation);
}
break;
}
case PIPE_ML_OPERATION_TYPE_RESIZE: {
thames_lower_resize(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, operation);
break;
}
case PIPE_ML_OPERATION_TYPE_PAD: {
// Just ignore the pad operation for now, as it will be handled by its consumers
break;
}
default:
DBG("poperation->type %d\n", poperations[i].type);
UNREACHABLE("Unsupported ML operation type");
}
}
}

View file

@@ -0,0 +1,15 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_LOWER_H
#define THAMES_LOWER_H
#include "thames_ml.h"
void
thames_lower_graph(struct thames_subgraph *subgraph,
const struct pipe_ml_operation *poperations, unsigned count);
#endif /* THAMES_LOWER_H */

View file

@@ -0,0 +1,363 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_inlines.h"
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <xf86drm.h>
#include "drm-uapi/thames_accel.h"
#include "thames_cmd.h"
#include "thames_lower.h"
#include "thames_ml.h"
void
thames_dump_buffer(const uint8_t *ptr, const char *name, int operation_nr,
int suboperation_nr, int offset, unsigned size)
{
char buffer[255];
snprintf(buffer, sizeof(buffer), "mesa-%s-%03d-%03d.bin", name, operation_nr,
suboperation_nr);
FILE *f = fopen(buffer, "wb");
assert(f);
if (!f)
return;
fwrite(ptr + offset, 1, size, f);
if (ferror(f)) {
DBG("Error in writing to file: %s\n", strerror(errno));
}
fflush(f);
fclose(f);
}
void
thames_register_tensor(struct thames_subgraph *subgraph,
const struct pipe_tensor *ptensor)
{
struct thames_tensor new_tensor = {0};
new_tensor.index = ptensor->index;
new_tensor.shape.height = ptensor->dims[1];
new_tensor.shape.width = ptensor->dims[2];
new_tensor.shape.depth = ptensor->dims[3];
new_tensor.layout = THAMES_LAYOUT_NHWC;
util_dynarray_append(&subgraph->tensors, new_tensor);
}
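/*
 * Feature maps are bump-allocated inside the shared IO buffer. Illustrative
 * sizes: an NHWC tensor of 8x8x3 takes 192 bytes and advances io_used by
 * ALIGN_POT(192, 16) = 192, while a 5x5x3 tensor (75 bytes) advances it by
 * 80; a tensor that has already been sized just reuses its recorded offset.
 */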
void
thames_allocate_feature_map(struct thames_subgraph *subgraph, struct thames_feature_map *feature_map)
{
struct thames_tensor *tensor = thames_find_tensor(subgraph, feature_map->tensor_idx);
unsigned size;
assert(tensor);
if (tensor->layout == THAMES_LAYOUT_NHWC) {
size = tensor->shape.width * tensor->shape.height * tensor->shape.depth;
} else if (tensor->layout == THAMES_LAYOUT_NHCWB16) {
size = tensor->shape.width * tensor->shape.height * ALIGN(tensor->shape.depth, 16);
} else {
assert(0 && "Unsupported layout");
size = 0; /* This should never happen */
}
if (tensor->size > 0) {
feature_map->tiles.addresses[0] = tensor->offset;
return;
}
tensor->offset = subgraph->io_used;
tensor->size = size;
subgraph->io_used += ALIGN_POT(size, 16);
feature_map->tiles.addresses[0] = tensor->offset;
}
struct thames_tensor *
thames_find_tensor(struct thames_subgraph *subgraph, unsigned tensor_idx)
{
util_dynarray_foreach (&subgraph->tensors, struct thames_tensor, tensor) {
if (tensor->index == tensor_idx) {
return tensor;
}
}
return NULL;
}
int
thames_round_up_to_multiple(int a, int b)
{
return ((a + b - 1) / b) * b;
}
int
thames_round_up_divide(int a, int b)
{
return (a + b - 1) / b;
}
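/*
 * The quantization below maps a floating-point scale onto a Q0.31 integer
 * multiplier plus a right shift. Worked example: scale = 0.125 = 0.5 * 2^-2,
 * so frexp() yields significand 0.5 and exponent -2; the multiplier becomes
 * round(0.5 * 2^31) = 1 << 30 and the shift 31 - (-2) = 33.
 */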
int
thames_quantize_scale(double scale, uint32_t *shift)
{
int exponent = 0;
double significand = frexp(scale, &exponent);
uint32_t quantized_scale = round(significand * (double)(1LL << 31));
/* Keep the shift signed while clamping: 31 - exponent goes negative for
* scales >= 1.0, which an unsigned *shift could never represent. */
int shift_val = 31 - exponent;
if (shift_val > 63) {
if (quantized_scale > exp2(shift_val - 63)) {
quantized_scale = quantized_scale >> (shift_val - 63);
shift_val = 63;
} else {
/* Not possible to get back within bounds, set scale and shift to 0
* as the shift would shift away all relevant bits anyway. */
quantized_scale = 0;
shift_val = 0;
}
} else if (shift_val < 0 && quantized_scale < exp2(shift_val + 32)) {
quantized_scale = quantized_scale << (0 - shift_val);
shift_val = 0;
}
*shift = shift_val;
return quantized_scale;
}
static bool
tensor_quantization_supported(struct pipe_tensor *tensor)
{
/*
* Per-axis quantization not supported, for details see:
* https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor
*/
return tensor->scales == NULL && tensor->zero_points == NULL;
}
bool
thames_ml_operation_supported(struct pipe_context *pcontext,
const struct pipe_ml_operation *operation)
{
bool supported = false;
switch (operation->type) {
case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
struct pipe_tensor *input_tensor = operation->input_tensors[0];
struct pipe_tensor *weight_tensor = operation->conv.weight_tensor;
struct pipe_tensor *bias_tensor = operation->conv.bias_tensor;
struct pipe_tensor *output_tensor = operation->output_tensors[0];
// Dilation and per-axis quantization not yet implemented
if (tensor_quantization_supported(input_tensor) &&
tensor_quantization_supported(weight_tensor) &&
tensor_quantization_supported(bias_tensor) &&
tensor_quantization_supported(output_tensor) &&
operation->conv.dilation_width_factor == 1 &&
operation->conv.dilation_height_factor == 1)
supported = true;
break;
}
case PIPE_ML_OPERATION_TYPE_ADD:
supported = operation->input_tensors[0]->resource == NULL &&
operation->input_tensors[1]->resource == NULL;
break;
case PIPE_ML_OPERATION_TYPE_POOLING:
case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE:
case PIPE_ML_OPERATION_TYPE_PAD:
case PIPE_ML_OPERATION_TYPE_RESIZE:
supported = true;
break;
case PIPE_ML_OPERATION_TYPE_CONCATENATION:
supported = operation->conc.axis == 3 ||
operation->conc.axis == -1;
break;
default:
supported = false;
}
return supported;
}
static const uint8_t kernel_data[] = {
#include "thames_kernel_bin.h"
};
struct pipe_ml_subgraph *
thames_ml_subgraph_create(struct pipe_context *pcontext,
const struct pipe_ml_operation *poperations,
unsigned count)
{
struct pipe_screen *pscreen = pcontext->screen;
struct thames_screen *screen = thames_screen(pscreen);
struct thames_subgraph *subgraph;
subgraph = calloc(1, sizeof(*subgraph));
if (!subgraph)
return NULL;
subgraph->base.context = pcontext;
util_dynarray_init(&subgraph->tensors, NULL);
util_dynarray_init(&subgraph->operations, NULL);
thames_lower_graph(subgraph, poperations, count);
#if 0
thames_emit_cmdstream(subgraph);
struct drm_thames_cmdstream_bo_create cmd_bo_create = {
.size = (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor),
.data = (uintptr_t)subgraph->cmdstream,
};
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS))
thames_dump_buffer((uint8_t *)subgraph->cmdstream, "cmdstream", 0, 0, 0, (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor));
int ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE, &cmd_bo_create);
assert(ret == 0);
free(subgraph->cmdstream);
subgraph->cmdstream_bo = cmd_bo_create.handle;
if (subgraph->coefs_used > 0) {
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->coefs_used);
pipe_buffer_write(subgraph->base.context, subgraph->coefs_rsrc, 0, subgraph->coefs_used, subgraph->coefs);
free(subgraph->coefs);
subgraph->coefs = NULL;
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "coefs", 0, 0, 0, pipe_buffer_size(subgraph->coefs_rsrc));
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
}
#endif
subgraph->kernel_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, ARRAY_SIZE(kernel_data));
pipe_buffer_write(pcontext, subgraph->kernel_rsrc, 0, ARRAY_SIZE(kernel_data), kernel_data);
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->kernel_rsrc,
PIPE_MAP_READ, &transfer_in);
DBG("Copied string %s to BO %d at %p\n", (const char *)kernel_data, thames_resource(subgraph->kernel_rsrc)->handle, buf);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
subgraph->io_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->io_used);
return &subgraph->base;
}
void
thames_ml_subgraph_invoke(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned inputs_count, unsigned input_idxs[],
void *inputs[], bool is_signed[])
{
struct thames_screen *screen = thames_screen(pcontext->screen);
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
struct drm_thames_submit submit = {0};
struct drm_thames_job job = {0};
struct timespec start, end;
int ret;
for (unsigned i = 0; i < inputs_count; i++) {
struct thames_tensor *input = thames_find_tensor(subgraph, input_idxs[i]);
assert(input);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS))
thames_dump_buffer(inputs[i], "input", 0, 0, 0, input->size);
pipe_buffer_write(pcontext, subgraph->io_rsrc, input->offset, input->size, inputs[i]);
}
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc));
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
job.kernel = thames_resource(subgraph->kernel_rsrc)->handle;
job.kernel_size = pipe_buffer_size(subgraph->kernel_rsrc);
if (subgraph->coefs_rsrc)
job.region_bo_handles[COEFS_REGION] = thames_resource(subgraph->coefs_rsrc)->handle;
job.region_bo_handles[IO_REGION] = thames_resource(subgraph->io_rsrc)->handle;
submit.jobs = (uintptr_t)&job;
submit.job_count = 1;
if (DBG_ENABLED(THAMES_DBG_MSGS))
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_SUBMIT, &submit);
assert(ret == 0);
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
long long duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec);
DBG("Submission took %lld ms\n", duration_ns / 1000000);
/* Force a sync */
struct pipe_transfer *transfer_in;
pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, PIPE_MAP_READ, &transfer_in);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec);
DBG("Execution took %lld ms\n", duration_ns / 1000000);
}
}
void
thames_ml_subgraph_read_outputs(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned outputs_count,
unsigned output_idxs[], void *outputsv[],
bool is_signed[])
{
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
uint8_t **outputs = (uint8_t **)outputsv;
for (unsigned i = 0; i < outputs_count; i++) {
struct thames_tensor *output = thames_find_tensor(subgraph, output_idxs[i]);
assert(output);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc));
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size, outputs[i]);
}
}
void
thames_ml_subgraph_destroy(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph)
{
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
pipe_resource_reference(&subgraph->io_rsrc, NULL);
pipe_resource_reference(&subgraph->coefs_rsrc, NULL);
pipe_resource_reference(&subgraph->kernel_rsrc, NULL);
util_dynarray_fini(&subgraph->operations);
util_dynarray_fini(&subgraph->tensors);
free(subgraph);
}

View file

@@ -0,0 +1,226 @@
/*
* Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_ML_H
#define THAMES_ML_H
#include <util/u_dynarray.h>
#include "thames_device.h"
#define SHRAM_BANKS 48
#define SHRAM_RESERVED_OUTPUT_BANKS 2
#define SHRAM_RESERVED_UNUSED_BANKS 2
#define SHRAM_RESERVED_END_BANKS 2
#define SHRAM_TOTAL_BANKS SHRAM_BANKS
#define SHRAM_BANK_SIZE_BYTES 1024
#define ACC_BITS 32 /* Always use 32-bit accumulators for now */
#define IFM_GRANULE 8
#define ACC_GRANULE 16
#define ARCH_SPLIT_DEPTH 16
#define BANK_SIZE_BYTES 1024
struct thames_block {
unsigned width;
unsigned height;
unsigned depth;
};
extern struct thames_block ARCH_OFM_BLOCK_MAX;
extern struct thames_block SUB_KERNEL_MAX;
extern struct thames_block IFM_UBLOCK;
extern struct thames_block OFM_UBLOCK;
#define COEFS_REGION 0
#define IO_REGION 1
#define SCRATCH_REGION 2
enum thames_operation_type {
THAMES_OPERATION_TYPE_CONVOLUTION,
THAMES_OPERATION_TYPE_POOLING,
THAMES_OPERATION_TYPE_ELTWISE,
THAMES_OPERATION_TYPE_DMA,
};
struct thames_tile_box {
unsigned height_0; /* The height of tile 0 */
unsigned height_1; /* The height of tile 1, 0 if unused */
unsigned width_0; /* The width of tile 0, and tile 2 (if used) */
unsigned addresses[4]; /* A list of 4 addresses, set unused addresses to 0 */
};
enum thames_layout {
THAMES_LAYOUT_NHWC,
THAMES_LAYOUT_NHCWB16,
};
enum thames_rounding_mode {
THAMES_ROUNDING_DOUBLE = 0,
THAMES_ROUNDING_TRUNCATE,
THAMES_ROUNDING_NATURAL,
};
struct thames_feature_map {
unsigned tensor_idx;
struct thames_block shape;
bool is_signed;
struct thames_tile_box tiles;
unsigned zero_point;
float scale;
};
struct thames_kernel {
unsigned height;
unsigned width;
unsigned stride_y;
unsigned stride_x;
unsigned dilation_y;
unsigned dilation_x;
bool depthwise;
bool is_signed;
unsigned zero_point;
float scale;
};
struct thames_padding {
unsigned top;
unsigned left;
unsigned bottom;
unsigned right;
};
struct thames_address_range {
unsigned region;
unsigned address;
long size;
};
struct thames_shram_layout {
unsigned ib_start;
unsigned ib_end;
unsigned ib_start2;
unsigned ab_start;
unsigned lut_start;
};
enum thames_acc_type {
THAMES_ACC_TYPE_INT_32BIT = 0,
THAMES_ACC_TYPE_INT_40BIT,
THAMES_ACC_TYPE_FP_S5_10,
};
struct thames_block_config {
struct thames_block ifm_block;
struct thames_block ofm_block;
struct thames_shram_layout shram_layout;
unsigned bank_size;
enum thames_acc_type acc_type;
bool is_partkernel;
};
#define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT */
struct thames_operation {
enum thames_operation_type type;
struct thames_block_config block_config;
union {
struct {
struct thames_address_range weights;
struct thames_address_range scales;
bool part_kernel_first;
bool depthwise;
} conv;
struct {
bool avg; /* true for avg, false for max */
} pooling;
struct {
unsigned lut_bytes;
} eltwise;
struct {
unsigned address;
long size;
} dma;
};
struct thames_feature_map ifm;
struct thames_feature_map ifm2;
struct thames_feature_map ofm;
struct thames_kernel kernel;
struct thames_padding pad;
bool upscale;
enum thames_rounding_mode round_mode;
struct thames_address_range read_accesses[MAX_MEMORY_ACCESSES];
struct thames_address_range write_accesses[MAX_MEMORY_ACCESSES];
};
struct thames_tensor {
unsigned index;
unsigned offset;
unsigned size;
struct thames_block shape;
enum thames_layout layout;
};
struct thames_subgraph {
struct pipe_ml_subgraph base;
struct util_dynarray operations; /* struct thames_operation */
struct util_dynarray tensors; /* struct thames_tensor */
struct pipe_resource *kernel_rsrc;
struct pipe_resource *io_rsrc;
unsigned io_used;
uint8_t *coefs;
struct pipe_resource *coefs_rsrc;
unsigned coefs_used;
};
bool
thames_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation);
struct pipe_ml_subgraph *
thames_ml_subgraph_create(struct pipe_context *pcontext,
const struct pipe_ml_operation *poperations,
unsigned count);
void thames_ml_subgraph_invoke(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned inputs_count, unsigned input_idxs[],
void *inputs[], bool is_signed[]);
void thames_ml_subgraph_read_outputs(struct pipe_context *pcontext,
struct pipe_ml_subgraph *psubgraph,
unsigned outputs_count,
unsigned output_idxs[], void *outputs[],
bool is_signed[]);
void thames_ml_subgraph_destroy(struct pipe_context *context,
struct pipe_ml_subgraph *psubgraph);
void thames_allocate_feature_map(struct thames_subgraph *subgraph, struct thames_feature_map *feature_map);
void thames_register_tensor(struct thames_subgraph *subgraph, const struct pipe_tensor *ptensor);
struct thames_tensor *thames_find_tensor(struct thames_subgraph *subgraph, unsigned tensor_idx);
void thames_dump_buffer(const uint8_t *ptr, const char *name, int operation_nr,
int suboperation_nr, int offset, unsigned size);
int thames_round_up_to_multiple(int a, int b);
int thames_round_up_divide(int a, int b);
int thames_quantize_scale(double scale, uint32_t *shift);
#endif /* THAMES_ML_H */

View file

@@ -0,0 +1,3 @@
#include "pipe/p_state.h"
struct pipe_ml_device *thames_ml_device_create(const char *spec);

View file

@@ -0,0 +1,193 @@
/*
* Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "thames_sched.h"
static int
required_input_size(int value, int stride, int border)
{
return (value - 1) * stride + border;
}
static struct thames_block
_get_ifm_blocksize(struct thames_operation *operation, struct thames_block ofm_block)
{
struct thames_block ifm_block = {0};
// IFM block height
int h = required_input_size(ofm_block.height, operation->kernel.stride_y, MIN2(operation->kernel.height, SUB_KERNEL_MAX.height));
h = ALIGN(h, OFM_UBLOCK.height);
// IFM block width
int w = required_input_size(ofm_block.width, operation->kernel.stride_x, MIN2(operation->kernel.width, SUB_KERNEL_MAX.width));
w = ALIGN(w, OFM_UBLOCK.width);
ifm_block.height = h;
ifm_block.width = w;
ifm_block.depth = ofm_block.depth;
return ifm_block;
}
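/*
 * SHRAM bank accounting example for the allocator below (illustrative
 * numbers): a 16x16x8 IFM block needs 16 * 16 * 8 = 2048 bytes, i.e.
 * ALIGN(DIV_ROUND_UP(2048, 1024) * 2, IFM_GRANULE) = 8 banks placed after
 * the 2 reserved output banks; with no LUT, 2 end banks are reserved
 * (lut_start = 46); a 16x16x8 OFM block with 32-bit accumulators needs
 * 8192 bytes, i.e. 16 banks, so ab_start = 46 - 16 = 30 and the layout
 * fits (ib_end = 10 <= 30).
 */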
static bool
try_block_config(struct thames_operation *operation, struct thames_block ofm_block, struct thames_block ifm_block, struct thames_shram_layout *layout)
{
int ifm_bytes = ifm_block.width * ifm_block.height * ALIGN(ifm_block.depth, 8);
int ifm_banks = ALIGN(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE);
int lut_bytes = operation->type == THAMES_OPERATION_TYPE_ELTWISE ? operation->eltwise.lut_bytes : 0;
int lut_banks = MAX2(DIV_ROUND_UP(lut_bytes, 1024), SHRAM_RESERVED_END_BANKS);
int lut_start = SHRAM_TOTAL_BANKS - lut_banks;
int ifm_end = SHRAM_RESERVED_OUTPUT_BANKS + ifm_banks;
int ifm2_start = ifm_end;
int acc_start = lut_start;
if (operation->type != THAMES_OPERATION_TYPE_ELTWISE) {
int acc_bytes = (ofm_block.width * ofm_block.height * ALIGN(ofm_block.depth, 8) * 32) / 8;
int acc_banks = ALIGN(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE);
acc_start -= acc_banks;
} else {
int ifm2_banks = ifm_banks; /* TODO: Fix for scalar eltwise */
if (ifm2_start + ifm2_banks > acc_start)
return false;
ifm_end = acc_start;
}
if (ifm_end > acc_start)
return false;
layout->ib_start = SHRAM_RESERVED_OUTPUT_BANKS;
layout->ib_start2 = ifm2_start;
layout->ib_end = ifm_end;
layout->ab_start = acc_start;
layout->lut_start = lut_start;
return true;
}
static struct thames_block_config
find_block_config(struct thames_operation *operation)
{
struct thames_block_config config = {0};
struct thames_block search_space = ARCH_OFM_BLOCK_MAX;
float ofm_elements = operation->ofm.shape.width * operation->ofm.shape.height * operation->ofm.shape.depth;
float ifm_elements = operation->ifm.shape.width * operation->ifm.shape.height * operation->ifm.shape.depth;
bool is_pooling = operation->type == THAMES_OPERATION_TYPE_POOLING;
bool is_convolution = operation->type == THAMES_OPERATION_TYPE_CONVOLUTION;
/* Only read the conv union member when this actually is a convolution. */
bool is_depthwise = is_convolution && operation->conv.depthwise;
bool is_equal_depth = is_pooling || is_depthwise || operation->type == THAMES_OPERATION_TYPE_ELTWISE;
float best_cost = FLT_MAX;
float best_coverage = FLT_MAX;
search_space.width = MIN2(search_space.width, operation->ofm.shape.width);
search_space.height = MIN2(search_space.height, operation->ofm.shape.height);
search_space.depth = MIN2(search_space.depth, operation->ofm.shape.depth);
unsigned depth = MAX2(OFM_UBLOCK.depth, MIN2(search_space.depth, ARCH_SPLIT_DEPTH));
if (depth < operation->ofm.shape.depth) {
depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
}
search_space.width = ALIGN(search_space.width, OFM_UBLOCK.width);
search_space.height = ALIGN(search_space.height, OFM_UBLOCK.height);
search_space.depth = ALIGN(search_space.depth, OFM_UBLOCK.depth);
while (depth <= search_space.depth) {
bool wont_fit[search_space.height + 1][search_space.width + 1];
memset(wont_fit, 0, sizeof(wont_fit));
for (unsigned height = OFM_UBLOCK.height; height <= search_space.height; height += OFM_UBLOCK.height) {
for (unsigned width = OFM_UBLOCK.width; width <= search_space.width; width += OFM_UBLOCK.width) {
if (wont_fit[height][width])
continue;
struct thames_block ofm_block = {.width = width, .height = height, .depth = depth};
struct thames_block ifm_block = _get_ifm_blocksize(operation, ofm_block);
if (!is_equal_depth)
ifm_block.depth = ALIGN(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth);
// Try to fit the blocks in SHRAM
struct thames_shram_layout layout = {0};
if (try_block_config(operation, ofm_block, ifm_block, &layout)) {
struct thames_block full_blocks = {DIV_ROUND_UP(operation->ofm.shape.width, ofm_block.width),
DIV_ROUND_UP(operation->ofm.shape.height, ofm_block.height),
DIV_ROUND_UP(operation->ofm.shape.depth, ofm_block.depth)};
float blocks[3] = {operation->ofm.shape.width / (float)ofm_block.width,
operation->ofm.shape.height / (float)ofm_block.height,
operation->ofm.shape.depth / (float)ofm_block.depth};
float weight_area = is_convolution ? operation->kernel.width * operation->kernel.height : 0;
float weight_fetch = weight_area * operation->ifm.shape.depth * full_blocks.width * full_blocks.height;
if (!is_depthwise)
weight_fetch *= blocks[2] * ofm_block.depth;
float ifm_fetch = ifm_block.width * ifm_block.height * operation->ifm.shape.depth * blocks[0] * blocks[1];
if (!is_equal_depth)
ifm_fetch *= full_blocks.depth;
float relative_cost = 0;
if (operation->type != THAMES_OPERATION_TYPE_ELTWISE)
relative_cost = (ifm_fetch + weight_fetch) / ofm_elements;
else
relative_cost = ofm_elements / (height * width * depth);
if (ifm_elements < ifm_block.width * ifm_block.height * ifm_block.depth * 2)
relative_cost /= 2.0f;
if (relative_cost <= best_cost) {
bool choose_this = false;
if (relative_cost == best_cost) {
struct thames_block coverage_shape = {
.width = MIN2(ifm_block.width, operation->ifm.shape.width),
.height = MIN2(ifm_block.height, operation->ifm.shape.height),
.depth = MIN2(ifm_block.depth, operation->ifm.shape.depth)};
float coverage = (float)(operation->ifm.shape.width * operation->ifm.shape.height) /
(float)MAX2(1, coverage_shape.width * coverage_shape.height);
if (coverage <= best_coverage && (height <= 4 && width <= 4)) {
best_coverage = coverage;
choose_this = true;
}
} else {
best_coverage = UINT_MAX;
choose_this = true;
}
if (choose_this) {
config.shram_layout = layout;
config.ifm_block = ifm_block;
config.ofm_block.height = height;
config.ofm_block.width = width;
config.ofm_block.depth = depth;
best_cost = relative_cost;
}
}
} else {
wont_fit[height][width] = true;
}
}
}
depth += OFM_UBLOCK.depth;
if (depth < operation->ofm.shape.depth) {
depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
}
}
return config;
}
void
thames_sched_operation(struct thames_subgraph *subgraph, struct thames_operation *operation)
{
operation->block_config = find_block_config(operation);
}

View file

@@ -0,0 +1,13 @@
/*
* Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#ifndef THAMES_SCHED_H
#define THAMES_SCHED_H
#include "thames_ml.h"
void thames_sched_operation(struct thames_subgraph *subgraph, struct thames_operation *operation);
#endif /* THAMES_SCHED_H */

View file

@@ -856,7 +856,8 @@ find_accel_device()
for (int i = 0; i < n; i++) {
if (strstr("rocket", devs[i]->driver_name) ||
strstr("ethosu", devs[i]->driver_name))
strstr("ethosu", devs[i]->driver_name) ||
strstr("thames", devs[i]->driver_name))
device = devs[i];
else
pipe_loader_release(&devs[i], 1);

View file

@@ -196,6 +196,12 @@ if with_gallium_ethosu
else
driver_ethosu = declare_dependency()
endif
if with_gallium_thames
subdir('winsys/thames/drm')
subdir('drivers/thames')
else
driver_thames = declare_dependency()
endif
if with_gallium_zink
subdir('drivers/zink')
else

View file

@@ -59,7 +59,7 @@ libgallium_dri = shared_library(
driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
driver_tegra, driver_i915, driver_svga, driver_virgl,
driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
driver_asahi, driver_crocus, driver_rocket, driver_ethosu
driver_asahi, driver_crocus, driver_rocket, driver_ethosu, driver_thames
],
install : true,
name_suffix : libname_suffix,

View file

@@ -10,6 +10,7 @@ libteflon = shared_library(
driver_etnaviv,
driver_rocket,
driver_ethosu,
driver_thames,
idep_nir,
idep_mesautil,
],

View file

@@ -0,0 +1,13 @@
# Copyright 2017 Broadcom
# SPDX-License-Identifier: MIT
libthameswinsys = static_library(
'thameswinsys',
files('thames_drm_winsys.c'),
include_directories : [
inc_src, inc_include,
inc_gallium, inc_gallium_aux, inc_gallium_drivers,
],
gnu_symbol_visibility : 'hidden',
dependencies: [dep_libdrm, idep_mesautil],
)

View file

@@ -0,0 +1,17 @@
/*
* Copyright 2014 Broadcom
* Copyright 2018 Alyssa Rosenzweig
* Copyright 2025 Tomeu Vizoso
* SPDX-License-Identifier: MIT
*/
#ifndef __THAMES_DRM_PUBLIC_H__
#define __THAMES_DRM_PUBLIC_H__
struct pipe_screen;
struct pipe_screen_config;
struct pipe_screen *
thames_drm_screen_create(int drmFD, const struct pipe_screen_config *config);
#endif /* __THAMES_DRM_PUBLIC_H__ */

View file

@@ -0,0 +1,19 @@
/*
* Copyright 2014 Broadcom
* Copyright 2018 Alyssa Rosenzweig
* Copyright 2025 Tomeu Vizoso
* SPDX-License-Identifier: MIT
*/
#include "util/os_file.h"
#include "util/u_screen.h"
#include "thames/thames_device.h"
#include "thames_drm_public.h"
struct pipe_screen *
thames_drm_screen_create(int fd, const struct pipe_screen_config *config)
{
return u_pipe_screen_lookup_or_create(os_dupfd_cloexec(fd), config, NULL,
thames_screen_create);
}