From 931683bab684c2b454adaddd3b4a84219c35afc0 Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Tue, 28 Oct 2025 12:39:12 +0100 Subject: [PATCH] WIP: thames: initial commit --- .clang-format-include | 1 + include/drm-uapi/thames_accel.h | 198 ++++++++ meson.build | 3 +- meson.options | 2 +- src/gallium/auxiliary/pipe-loader/meson.build | 3 + .../auxiliary/pipe-loader/pipe_loader_drm.c | 4 + .../auxiliary/target-helpers/drm_helper.h | 19 + .../target-helpers/drm_helper_public.h | 1 + src/gallium/drivers/thames/.clang-format | 2 + .../drivers/thames/ci/thames-j722s-fails.txt | 0 .../drivers/thames/ci/thames-j722s-flakes.txt | 0 .../drivers/thames/ci/thames-j722s-skips.txt | 14 + src/gallium/drivers/thames/meson.build | 32 ++ src/gallium/drivers/thames/thames_cmd.c | 21 + src/gallium/drivers/thames/thames_cmd.h | 13 + src/gallium/drivers/thames/thames_coefs.c | 17 + src/gallium/drivers/thames/thames_coefs.h | 17 + src/gallium/drivers/thames/thames_device.c | 222 +++++++++ src/gallium/drivers/thames/thames_device.h | 73 +++ .../drivers/thames/thames_kernel_bin.h | 1 + src/gallium/drivers/thames/thames_lower.c | 454 ++++++++++++++++++ src/gallium/drivers/thames/thames_lower.h | 15 + src/gallium/drivers/thames/thames_ml.c | 363 ++++++++++++++ src/gallium/drivers/thames/thames_ml.h | 226 +++++++++ src/gallium/drivers/thames/thames_public.h | 3 + src/gallium/drivers/thames/thames_sched.c | 193 ++++++++ src/gallium/drivers/thames/thames_sched.h | 13 + src/gallium/frontends/teflon/tfl_device.c | 3 +- src/gallium/meson.build | 6 + src/gallium/targets/dri/meson.build | 2 +- src/gallium/targets/teflon/meson.build | 1 + src/gallium/winsys/thames/drm/meson.build | 13 + .../winsys/thames/drm/thames_drm_public.h | 17 + .../winsys/thames/drm/thames_drm_winsys.c | 19 + 34 files changed, 1967 insertions(+), 4 deletions(-) create mode 100644 include/drm-uapi/thames_accel.h create mode 100644 src/gallium/drivers/thames/.clang-format create mode 100644 
src/gallium/drivers/thames/ci/thames-j722s-fails.txt create mode 100644 src/gallium/drivers/thames/ci/thames-j722s-flakes.txt create mode 100644 src/gallium/drivers/thames/ci/thames-j722s-skips.txt create mode 100644 src/gallium/drivers/thames/meson.build create mode 100644 src/gallium/drivers/thames/thames_cmd.c create mode 100644 src/gallium/drivers/thames/thames_cmd.h create mode 100644 src/gallium/drivers/thames/thames_coefs.c create mode 100644 src/gallium/drivers/thames/thames_coefs.h create mode 100644 src/gallium/drivers/thames/thames_device.c create mode 100644 src/gallium/drivers/thames/thames_device.h create mode 100644 src/gallium/drivers/thames/thames_kernel_bin.h create mode 100644 src/gallium/drivers/thames/thames_lower.c create mode 100644 src/gallium/drivers/thames/thames_lower.h create mode 100644 src/gallium/drivers/thames/thames_ml.c create mode 100644 src/gallium/drivers/thames/thames_ml.h create mode 100644 src/gallium/drivers/thames/thames_public.h create mode 100644 src/gallium/drivers/thames/thames_sched.c create mode 100644 src/gallium/drivers/thames/thames_sched.h create mode 100644 src/gallium/winsys/thames/drm/meson.build create mode 100644 src/gallium/winsys/thames/drm/thames_drm_public.h create mode 100644 src/gallium/winsys/thames/drm/thames_drm_winsys.c diff --git a/.clang-format-include b/.clang-format-include index ba52553fdc9..99abef3dd8e 100644 --- a/.clang-format-include +++ b/.clang-format-include @@ -5,6 +5,7 @@ src/gallium/drivers/ethosu/**/* src/gallium/drivers/i915 src/gallium/drivers/r300/compiler/* src/gallium/drivers/rocket/**/* +src/gallium/drivers/thames/**/* src/gallium/targets/teflon/**/* src/gallium/frontends/teflon/**/* src/amd/vulkan/**/* diff --git a/include/drm-uapi/thames_accel.h b/include/drm-uapi/thames_accel.h new file mode 100644 index 00000000000..cc1abe306b8 --- /dev/null +++ b/include/drm-uapi/thames_accel.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: MIT */ +/* Copyright (C) 2025 Texas Instruments 
Incorporated - https://www.ti.com/ */ +#ifndef _THAMES_DRM_H_ +#define _THAMES_DRM_H_ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * DOC: IOCTL IDs + * + * enum drm_thames_ioctl_id - IOCTL IDs + * + * Place new ioctls at the end, don't re-order, don't replace or remove entries. + * + * These IDs are not meant to be used directly. Use the DRM_IOCTL_THAMES_xxx + * definitions instead. + */ +enum drm_thames_ioctl_id { + /** @DRM_THAMES_BO_CREATE: Create a buffer object. */ + DRM_THAMES_BO_CREATE, + + /** @DRM_THAMES_BO_WAIT: Wait on a buffer object's fence. */ + DRM_THAMES_BO_WAIT, + + /** + * @DRM_THAMES_BO_MMAP_OFFSET: Get the file offset to pass to + * mmap to map a GEM object. + */ + DRM_THAMES_BO_MMAP_OFFSET, + + /** + * @DRM_THAMES_CMDSTREAM_BO_CREATE: Create a command stream buffer + * object. + */ + DRM_THAMES_CMDSTREAM_BO_CREATE, + + /** @DRM_THAMES_SUBMIT: Submit a job and BOs to run. */ + DRM_THAMES_SUBMIT, +}; + +/** + * DOC: IOCTL arguments + */ + +/** + * enum drm_thames_bo_flags - Buffer object flags, passed at creation time. + */ +enum drm_thames_bo_flags { + /** + * @DRM_THAMES_BO_NO_MMAP: The buffer object will never be CPU-mapped + * in userspace. + */ + DRM_THAMES_BO_NO_MMAP = (1 << 0), +}; + +/** + * struct drm_thames_bo_create - Arguments passed to DRM_IOCTL_THAMES_BO_CREATE. + */ +struct drm_thames_bo_create { + /** + * @size: Requested size for the object + * + * The (page-aligned) allocated size for the object will be returned. + */ + __u64 size; + + /** + * @flags: Flags. Must be a combination of drm_thames_bo_flags flags. + */ + __u32 flags; + + /** + * @handle: Returned handle for the object. + * + * Object handles are nonzero. + */ + __u32 handle; +}; + +/** + * struct drm_thames_bo_mmap_offset - Arguments passed to DRM_IOCTL_THAMES_BO_MMAP_OFFSET. + */ +struct drm_thames_bo_mmap_offset { + /** @handle: Handle of the object we want an mmap offset for. */ + __u32 handle; + + /** @pad: MBZ. 
*/ + __u32 pad; + + /** @offset: The fake offset to use for subsequent mmap calls. */ + __u64 offset; +}; + +/** + * struct drm_thames_wait_bo - ioctl argument for waiting for + * completion of the last DRM_THAMES_SUBMIT on a BO. + * + * This is useful for cases where multiple processes might be + * rendering to a BO and you want to wait for all rendering to be + * completed. + */ +struct drm_thames_bo_wait { + __u32 handle; + __u32 pad; + __s64 timeout_ns; /* absolute */ +}; + + +struct drm_thames_cmdstream_bo_create { + /* Size of the data argument. */ + __u32 size; + + /* Flags, currently must be 0. */ + __u32 flags; + + /* Pointer to the data. */ + __u64 data; + + /** Returned GEM handle for the BO. */ + __u32 handle; + + /* Pad, must be 0. */ + __u32 pad; +}; + +/** + * struct drm_thames_job - A job to be run on the NPU + * + * The kernel will schedule the execution of this job taking into account its + * dependencies with other jobs. All tasks in the same job will be executed + * sequentially on the same core, to benefit from memory residency in SRAM. + */ +struct drm_thames_job { + /** Input: BO handle for kernel. */ + __u32 kernel; + + /** Input: Size in bytes of the compiled kernel. */ + __u32 kernel_size; + +#define THAMES_MAX_REGIONS 8 + /** Input: Array of BO handles for each region. */ + __u32 region_bo_handles[THAMES_MAX_REGIONS]; +}; + +/** + * struct drm_thames_submit - ioctl argument for submitting commands to the NPU. + * + * The kernel will schedule the execution of these jobs in dependency order. + */ +struct drm_thames_submit { + /** Input: Pointer to an array of struct drm_thames_job. */ + __u64 jobs; + + /** Input: Number of jobs passed in. */ + __u32 job_count; + + /** Reserved, must be zero. */ + __u32 pad; +}; + + +/** + * DRM_IOCTL_THAMES() - Build a thames IOCTL number + * @__access: Access type. Must be R, W or RW. + * @__id: One of the DRM_THAMES_xxx id. + * @__type: Suffix of the type being passed to the IOCTL. 
+ * + * Don't use this macro directly, use the DRM_IOCTL_THAMES_xxx + * values instead. + * + * Return: An IOCTL number to be passed to ioctl() from userspace. + */ +#define DRM_IOCTL_THAMES(__access, __id, __type) \ + DRM_IO ## __access(DRM_COMMAND_BASE + DRM_THAMES_ ## __id, \ + struct drm_thames_ ## __type) + +enum { + DRM_IOCTL_THAMES_BO_CREATE = + DRM_IOCTL_THAMES(WR, BO_CREATE, bo_create), + DRM_IOCTL_THAMES_BO_WAIT = + DRM_IOCTL_THAMES(WR, BO_WAIT, bo_wait), + DRM_IOCTL_THAMES_BO_MMAP_OFFSET = + DRM_IOCTL_THAMES(WR, BO_MMAP_OFFSET, bo_mmap_offset), + DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE = + DRM_IOCTL_THAMES(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create), + DRM_IOCTL_THAMES_SUBMIT = + DRM_IOCTL_THAMES(WR, SUBMIT, submit), +}; + +#if defined(__cplusplus) +} +#endif + +#endif /* _THAMES_DRM_H_ */ diff --git a/meson.build b/meson.build index 80917def523..c4273c73c39 100644 --- a/meson.build +++ b/meson.build @@ -186,7 +186,7 @@ elif gallium_drivers.contains('all') gallium_drivers = [ 'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915', 'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris', - 'zink', 'd3d12', 'asahi', 'rocket', 'ethosu' + 'zink', 'd3d12', 'asahi', 'rocket', 'ethosu', 'thames' ] endif @@ -215,6 +215,7 @@ with_gallium_d3d12 = gallium_drivers.contains('d3d12') with_gallium_asahi = gallium_drivers.contains('asahi') with_gallium_rocket = gallium_drivers.contains('rocket') with_gallium_ethosu = gallium_drivers.contains('ethosu') +with_gallium_thames = gallium_drivers.contains('thames') foreach gallium_driver : gallium_drivers pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper()) endforeach diff --git a/meson.options b/meson.options index 75731475c12..d8f22d3a71a 100644 --- a/meson.options +++ b/meson.options @@ -88,7 +88,7 @@ option( 'all', 'auto', 'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris', 'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 
'radeonsi', - 'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink', + 'rocket', 'softpipe', 'svga', 'tegra', 'thames', 'v3d', 'vc4', 'virgl', 'zink', ], description : 'List of gallium drivers to build. If this is set to auto ' + 'all drivers applicable to the target OS/architecture ' + diff --git a/src/gallium/auxiliary/pipe-loader/meson.build b/src/gallium/auxiliary/pipe-loader/meson.build index 2bda1d28872..c4003bb5cb1 100644 --- a/src/gallium/auxiliary/pipe-loader/meson.build +++ b/src/gallium/auxiliary/pipe-loader/meson.build @@ -45,6 +45,9 @@ endif if with_gallium_ethosu renderonly_drivers_c_args += '-DGALLIUM_ETHOSU' endif +if with_gallium_thames + renderonly_drivers_c_args += '-DGALLIUM_THAMES' +endif libpipe_loader_static = static_library( 'pipe_loader_static', diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c index ee6500e9e3c..87d2516e1ae 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c @@ -87,6 +87,7 @@ static const struct drm_driver_descriptor *driver_descriptors[] = { &rocket_driver_descriptor, ðosu_driver_descriptor, &tegra_driver_descriptor, + &thames_driver_descriptor, &lima_driver_descriptor, &zink_driver_descriptor, }; @@ -383,6 +384,9 @@ pipe_loader_get_compatible_render_capable_device_fds(int kms_only_fd, unsigned i #if defined GALLIUM_ETHOSU "ethosu", #endif +#if defined GALLIUM_THAMES + "thames", +#endif #if defined GALLIUM_V3D "v3d", #endif diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h index 236b68f8c0e..51d4a0b63f9 100644 --- a/src/gallium/auxiliary/target-helpers/drm_helper.h +++ b/src/gallium/auxiliary/target-helpers/drm_helper.h @@ -53,6 +53,7 @@ const struct drm_driver_descriptor descriptor_name = { \ #undef GALLIUM_ASAHI #undef GALLIUM_ROCKET #undef GALLIUM_ETHOSU +#undef GALLIUM_THAMES #endif #ifdef GALLIUM_I915 
@@ -480,6 +481,24 @@ DRM_DRIVER_DESCRIPTOR(ethosu, NULL, 0) DRM_DRIVER_DESCRIPTOR_STUB(ethosu) #endif +#ifdef GALLIUM_THAMES +#include "thames/drm/thames_drm_public.h" + +static struct pipe_screen * +pipe_thames_create_screen(int fd, const struct pipe_screen_config *config) +{ + struct pipe_screen *screen; + + screen = thames_drm_screen_create(fd, config); + return screen ? debug_screen_wrap(screen) : NULL; +} + +DRM_DRIVER_DESCRIPTOR(thames, NULL, 0) + +#else +DRM_DRIVER_DESCRIPTOR_STUB(thames) +#endif + #ifdef GALLIUM_KMSRO #include "kmsro/drm/kmsro_drm_public.h" diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h index fe0e12280f6..838774deef9 100644 --- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h +++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h @@ -25,6 +25,7 @@ extern const struct drm_driver_descriptor rknpu_driver_descriptor; extern const struct drm_driver_descriptor rocket_driver_descriptor; extern const struct drm_driver_descriptor ethosu_driver_descriptor; extern const struct drm_driver_descriptor tegra_driver_descriptor; +extern const struct drm_driver_descriptor thames_driver_descriptor; extern const struct drm_driver_descriptor lima_driver_descriptor; extern const struct drm_driver_descriptor zink_driver_descriptor; extern const struct drm_driver_descriptor kmsro_driver_descriptor; diff --git a/src/gallium/drivers/thames/.clang-format b/src/gallium/drivers/thames/.clang-format new file mode 100644 index 00000000000..34cd9d7d1d3 --- /dev/null +++ b/src/gallium/drivers/thames/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: InheritParentConfig +DisableFormat: false diff --git a/src/gallium/drivers/thames/ci/thames-j722s-fails.txt b/src/gallium/drivers/thames/ci/thames-j722s-fails.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/gallium/drivers/thames/ci/thames-j722s-flakes.txt b/src/gallium/drivers/thames/ci/thames-j722s-flakes.txt 
new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/gallium/drivers/thames/ci/thames-j722s-skips.txt b/src/gallium/drivers/thames/ci/thames-j722s-skips.txt new file mode 100644 index 00000000000..65e55edf082 --- /dev/null +++ b/src/gallium/drivers/thames/ci/thames-j722s-skips.txt @@ -0,0 +1,14 @@ +Add.Op/.* +AddQuant.Op/.* +Conv2D.Op/.* +DepthwiseConv2D.Op/.* +FullyConnected.Op/.* + +# Don't support unfused Pad operations yet +Models.Op/yolox_000 +Models.Op/yolox_003 +Models.Op/yolox_012 +Models.Op/yolox_027 +Models.Op/yolox_042 +Models.Op/yolox_077 +Models.Op/yolox_086 diff --git a/src/gallium/drivers/thames/meson.build b/src/gallium/drivers/thames/meson.build new file mode 100644 index 00000000000..d7359d751e2 --- /dev/null +++ b/src/gallium/drivers/thames/meson.build @@ -0,0 +1,32 @@ +# Copyright 2019 Google, Inc +# SPDX-License-Identifier: MIT + +# thames_registers = custom_target( +# 'thames_registers.h', +# input : ['gen_parser.py', 'gen_header.py', 'registers.xml'], +# output : 'thames_registers.h', +# command : [prog_python, '@INPUT1@', '--rnn', '.', '--xml', '@INPUT2@', 'c-defines'], +# capture : true, +# ) + +files_thames = files( + 'thames_cmd.c', + 'thames_coefs.c', + 'thames_device.c', + 'thames_lower.c', + 'thames_ml.c', + 'thames_sched.c', +) + +libthames = static_library( + 'thames', + [files_thames], #, thames_registers], + include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src], + gnu_symbol_visibility : 'hidden', + dependencies : [idep_mesautil, dep_libdrm], +) + +driver_thames = declare_dependency( + compile_args : '-DGALLIUM_THAMES', + link_with : [libthameswinsys, libthames] +) diff --git a/src/gallium/drivers/thames/thames_cmd.c b/src/gallium/drivers/thames/thames_cmd.c new file mode 100644 index 00000000000..be60083a102 --- /dev/null +++ b/src/gallium/drivers/thames/thames_cmd.c @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT 
+ */ + +#include +#include +#include +#include "util/macros.h" +#include "util/u_dynarray.h" + +#include "thames_cmd.h" +#include "thames_coefs.h" +#include "thames_ml.h" +#include "thames_sched.h" + +void +thames_emit_cmdstream(struct thames_subgraph *subgraph) +{ + +} diff --git a/src/gallium/drivers/thames/thames_cmd.h b/src/gallium/drivers/thames/thames_cmd.h new file mode 100644 index 00000000000..b8281cb6694 --- /dev/null +++ b/src/gallium/drivers/thames/thames_cmd.h @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT + */ + +#ifndef THAMES_CMD_H +#define THAMES_CMD_H + +#include "thames_ml.h" + +void thames_emit_cmdstream(struct thames_subgraph *subgraph); + +#endif /* THAMES_CMD_H */ diff --git a/src/gallium/drivers/thames/thames_coefs.c b/src/gallium/drivers/thames/thames_coefs.c new file mode 100644 index 00000000000..1f9eadbabcd --- /dev/null +++ b/src/gallium/drivers/thames/thames_coefs.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT + */ + +#include "util/u_inlines.h" + +#include "thames_coefs.h" + +void +thames_fill_coefs(struct thames_subgraph *subgraph, + struct thames_operation *operation, + struct pipe_resource *bias_rsrc, + struct pipe_resource *weight_rsrc) +{ + +} diff --git a/src/gallium/drivers/thames/thames_coefs.h b/src/gallium/drivers/thames/thames_coefs.h new file mode 100644 index 00000000000..db2146504f7 --- /dev/null +++ b/src/gallium/drivers/thames/thames_coefs.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT + */ + +#ifndef THAMES_COEFS_H +#define THAMES_COEFS_H + +#include "thames_ml.h" + +void +thames_fill_coefs(struct thames_subgraph *subgraph, + struct thames_operation *operation, + struct pipe_resource *bias_rsrc, + struct pipe_resource *weight_rsrc); + +#endif /* THAMES_COEFS_H */ diff 
--git a/src/gallium/drivers/thames/thames_device.c b/src/gallium/drivers/thames/thames_device.c new file mode 100644 index 00000000000..315c689017b --- /dev/null +++ b/src/gallium/drivers/thames/thames_device.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT + */ + +#include "thames_device.h" +#include "thames_ml.h" + +#include "drm-uapi/thames_accel.h" + +#include +#include "util/os_mman.h" +#include "util/u_inlines.h" +#include "util/u_surface.h" +#include "util/u_transfer.h" + +static const struct debug_named_value thames_debug_options[] = { + {"dbg_msgs", THAMES_DBG_MSGS, "Print debug messages"}, + {"dump_bos", THAMES_DBG_DUMP_BOS, "Dump buffers for analysis"}, + {"zero_bos", THAMES_DBG_ZERO, "Zero buffers for debugging"}, + DEBUG_NAMED_VALUE_END}; + +DEBUG_GET_ONCE_FLAGS_OPTION(thames_debug, "THAMES_DEBUG", thames_debug_options, 0) +int thames_debug = 0; + +static void +thames_destroy_screen(struct pipe_screen *pscreen) +{ + struct thames_screen *screen = thames_screen(pscreen); + + ralloc_free(screen); +} + +static void +thames_destroy_context(struct pipe_context *pctx) +{ + struct thames_context *ctx = thames_context(pctx); + + ralloc_free(ctx); +} + +static void * +thames_buffer_map(struct pipe_context *pctx, + struct pipe_resource *prsc, unsigned level, + unsigned usage, const struct pipe_box *box, + struct pipe_transfer **out_transfer) +{ + struct thames_screen *screen = thames_screen(pctx->screen); + struct thames_resource *rsc = thames_resource(prsc); + struct drm_thames_bo_wait bo_wait = {0}; + struct drm_thames_bo_mmap_offset bo_mmap_offset = {0}; + int ret; + + assert(level == 0); + assert(prsc->target == PIPE_BUFFER); + assert(box->y == 0); + assert(box->z == 0); + assert(box->height == 1); + assert(box->depth == 1); + + struct pipe_transfer *transfer = rzalloc(NULL, struct pipe_transfer); + transfer->level = level; + transfer->usage = usage; + transfer->box = *box; + 
+ pipe_resource_reference(&transfer->resource, prsc); + + bo_wait.handle = rsc->handle; + bo_wait.timeout_ns = INT64_MAX; + + ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_WAIT, &bo_wait); + if (ret == -1) + goto free_transfer; + + bo_mmap_offset.handle = rsc->handle; + ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_MMAP_OFFSET, &bo_mmap_offset); + if (ret == -1) + goto free_transfer; + + uint8_t *map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED, + screen->fd, bo_mmap_offset.offset); + assert(map != MAP_FAILED); + if (map == MAP_FAILED) + goto free_transfer; + + *out_transfer = transfer; + + return map + box->x; + +free_transfer: + pipe_resource_reference(&transfer->resource, NULL); + ralloc_free(transfer); + return NULL; +} + +static void +thames_buffer_unmap(struct pipe_context *pctx, + struct pipe_transfer *transfer) +{ + pipe_resource_reference(&transfer->resource, NULL); + ralloc_free(transfer); +} + +static struct pipe_context * +thames_create_context(struct pipe_screen *screen, + void *priv, unsigned flags) +{ + struct thames_context *ctx = rzalloc(NULL, struct thames_context); + struct pipe_context *pctx = &ctx->base; + + if (!ctx) + return NULL; + + pctx->screen = screen; + pctx->priv = priv; + + pctx->destroy = thames_destroy_context; + + pctx->buffer_map = thames_buffer_map; + pctx->buffer_unmap = thames_buffer_unmap; + pctx->resource_copy_region = util_resource_copy_region; + pctx->buffer_subdata = u_default_buffer_subdata; + pctx->clear_buffer = u_default_clear_buffer; + + pctx->ml_operation_supported = thames_ml_operation_supported; + pctx->ml_subgraph_create = thames_ml_subgraph_create; + pctx->ml_subgraph_invoke = thames_ml_subgraph_invoke; + pctx->ml_subgraph_read_output = thames_ml_subgraph_read_outputs; + pctx->ml_subgraph_destroy = thames_ml_subgraph_destroy; + + return pctx; +} + +static struct pipe_resource * +thames_resource_create(struct pipe_screen *pscreen, + const struct pipe_resource *templat) +{ + struct 
thames_screen *screen = thames_screen(pscreen); + struct drm_thames_bo_create arg = {0}; + struct thames_resource *rsc; + int ret; + + assert(templat->target == PIPE_BUFFER); + assert(templat->height0 == 1); + assert(templat->depth0 == 1); + assert(templat->array_size == 1); + + rsc = rzalloc(NULL, struct thames_resource); + if (!rsc) + return NULL; + + rsc->base = *templat; + rsc->base.screen = pscreen; + rsc->base.nr_samples = templat->nr_samples; + pipe_reference_init(&rsc->base.reference, 1); + + rsc->bo_size = templat->width0; + + arg.size = templat->width0; + + ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_BO_CREATE, &arg); + if (ret < 0) + goto free_rsc; + + rsc->handle = arg.handle; + + return &rsc->base; + +free_rsc: + ralloc_free(rsc); + return NULL; +} + +static void +thames_resource_destroy(struct pipe_screen *pscreen, + struct pipe_resource *prsc) +{ + struct thames_resource *rsc = thames_resource(prsc); + struct thames_screen *screen = thames_screen(pscreen); + struct drm_gem_close arg = {0}; + int ret; + + arg.handle = rsc->handle; + + ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg); + assert(ret >= 0); + + ralloc_free(rsc); +} + +static int +thames_screen_get_fd(struct pipe_screen *pscreen) +{ + return thames_screen(pscreen)->fd; +} + +struct pipe_screen * +thames_screen_create(int fd, + const struct pipe_screen_config *config, + struct renderonly *ro) +{ + struct thames_screen *thames_screen; + struct pipe_screen *screen; + + thames_screen = rzalloc(NULL, struct thames_screen); + if (!thames_screen) + return NULL; + + screen = &thames_screen->pscreen; + + thames_debug = debug_get_option_thames_debug(); + + thames_screen->fd = fd; + + screen->get_screen_fd = thames_screen_get_fd; + screen->destroy = thames_destroy_screen; + screen->context_create = thames_create_context; + screen->resource_create = thames_resource_create; + screen->resource_destroy = thames_resource_destroy; + + return screen; +} \ No newline at end of file diff --git 
a/src/gallium/drivers/thames/thames_device.h b/src/gallium/drivers/thames/thames_device.h new file mode 100644 index 00000000000..5542c7bf1d2 --- /dev/null +++ b/src/gallium/drivers/thames/thames_device.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT + */ + +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "renderonly/renderonly.h" +#include "util/log.h" + +#include "drm-uapi/thames_accel.h" + +#ifndef THAMES_SCREEN_H +#define THAMES_SCREEN_H + +enum thames_dbg { + THAMES_DBG_MSGS = BITFIELD_BIT(0), + THAMES_DBG_DUMP_BOS = BITFIELD_BIT(1), + THAMES_DBG_ZERO = BITFIELD_BIT(2), +}; + +extern int thames_debug; + +#define DBG_ENABLED(flag) unlikely(thames_debug &(flag)) + +#define DBG(fmt, ...) \ + do { \ + if (DBG_ENABLED(THAMES_DBG_MSGS)) \ + mesa_logd("%s:%d: " fmt, __func__, __LINE__, \ + ##__VA_ARGS__); \ + } while (0) + +struct thames_screen { + struct pipe_screen pscreen; + + int fd; +}; + +static inline struct thames_screen * +thames_screen(struct pipe_screen *p) +{ + return (struct thames_screen *)p; +} + +struct thames_context { + struct pipe_context base; +}; + +static inline struct thames_context * +thames_context(struct pipe_context *pctx) +{ + return (struct thames_context *)pctx; +} + +struct thames_resource { + struct pipe_resource base; + + uint32_t handle; + uint64_t bo_size; +}; + +static inline struct thames_resource * +thames_resource(struct pipe_resource *p) +{ + return (struct thames_resource *)p; +} + +struct pipe_screen *thames_screen_create(int fd, + const struct pipe_screen_config *config, + struct renderonly *ro); + +#endif /* THAMES_SCREEN_H */ diff --git a/src/gallium/drivers/thames/thames_kernel_bin.h b/src/gallium/drivers/thames/thames_kernel_bin.h new file mode 100644 index 00000000000..d92e7c7972f --- /dev/null +++ b/src/gallium/drivers/thames/thames_kernel_bin.h @@ -0,0 +1 @@ +0x48, 0x65, 0x6c, 0x6c, 0x6f, 
0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x0 \ No newline at end of file diff --git a/src/gallium/drivers/thames/thames_lower.c b/src/gallium/drivers/thames/thames_lower.c new file mode 100644 index 00000000000..3aa5e34ad36 --- /dev/null +++ b/src/gallium/drivers/thames/thames_lower.c @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT + */ + +#include "thames_lower.h" +#include "thames_coefs.h" +#include "thames_sched.h" + +static bool +is_depthwise(const struct pipe_ml_operation *poperation) +{ + unsigned input_channels = poperation->input_tensors[0]->dims[3]; + unsigned output_channels = poperation->output_tensors[0]->dims[3]; + + return poperation->conv.depthwise && input_channels > 1 && + output_channels > 1; +} + +static unsigned +needed_total_padding(unsigned input_size, unsigned stride, unsigned filter_size) +{ + if (input_size % stride == 0) + return MAX2(filter_size - stride, 0); + + return MAX2(filter_size - (input_size % stride), 0); +} + +static bool +thames_is_part_kernel_first(struct thames_operation *operation) +{ + // Determine which block traversal strategy has better DPU utilization + unsigned kernel_size = operation->kernel.height * operation->kernel.width; + unsigned depth = operation->ifm.shape.depth; + float depth_utilization = (float)depth / thames_round_up_to_multiple(depth, 32); + float part_kernel_utilization = ((float)depth / thames_round_up_to_multiple(depth, 8)); + part_kernel_utilization *= (float)kernel_size / thames_round_up_to_multiple(kernel_size, 4); + + if (operation->type != THAMES_OPERATION_TYPE_CONVOLUTION) + return false; + + if (operation->kernel.depthwise) + return false; + + // Part-kernel first is always better for ifm depths <= 8 + if (part_kernel_utilization >= depth_utilization || depth <= 8) + return true; + + return false; +} + +static void +set_feature_maps(struct pipe_tensor *input_tensor, + struct pipe_tensor *output_tensor, + struct 
thames_operation *operation) +{ + operation->ifm.tensor_idx = input_tensor->index; + operation->ifm.shape.height = input_tensor->dims[1]; + operation->ifm.shape.width = input_tensor->dims[2]; + operation->ifm.shape.depth = input_tensor->dims[3]; + operation->ifm.zero_point = input_tensor->zero_point; + operation->ifm.scale = input_tensor->scale; + operation->ifm.is_signed = input_tensor->is_signed; + + operation->ofm.tensor_idx = output_tensor->index; + operation->ofm.shape.height = output_tensor->dims[1]; + operation->ofm.shape.width = output_tensor->dims[2]; + operation->ofm.shape.depth = output_tensor->dims[3]; + operation->ofm.zero_point = output_tensor->zero_point; + operation->ofm.scale = output_tensor->scale; + operation->ofm.is_signed = output_tensor->is_signed; +} + +static const struct pipe_ml_operation * +thames_find_first_consumer(const struct pipe_ml_operation *poperations, + unsigned count, + unsigned tensor_index) +{ + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + for (unsigned j = 0; j < poperation->input_count; j++) + if (poperation->input_tensors[j]->index == tensor_index) + return poperation; + } + + return NULL; +} + +static void +allocate_feature_maps(struct thames_subgraph *subgraph, struct thames_operation *operation) +{ + thames_allocate_feature_map(subgraph, &operation->ifm); + operation->ifm.tiles.height_0 = operation->ifm.shape.height; + operation->ifm.tiles.height_1 = operation->ifm.shape.height; + operation->ifm.tiles.width_0 = operation->ifm.shape.width; + + thames_allocate_feature_map(subgraph, &operation->ofm); + operation->ofm.tiles.height_0 = operation->ofm.shape.height; + operation->ofm.tiles.height_1 = operation->ofm.shape.height; + operation->ofm.tiles.width_0 = operation->ofm.shape.width; +} + +static const struct pipe_ml_operation * +thames_find_first_producer(const struct pipe_ml_operation *poperations, unsigned count, + unsigned tensor_index) +{ + for (unsigned i = 
0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + + for (unsigned j = 0; j < poperation->output_count; j++) { + if (poperation->output_tensors[j]->index == tensor_index) + return poperation; + } + } + + return NULL; +} + +static void +thames_lower_convolution(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct pipe_tensor *input_tensor, + struct thames_operation *operation) +{ + operation->type = THAMES_OPERATION_TYPE_CONVOLUTION; + + operation->conv.depthwise = is_depthwise(poperation); + // operation->padding_same = poperation->conv.padding_same; + // operation->stride = poperation->conv.stride_x; + + set_feature_maps(input_tensor, poperation->output_tensors[0], operation); + + operation->kernel.height = poperation->conv.weight_tensor->dims[1]; + operation->kernel.width = poperation->conv.weight_tensor->dims[2]; + operation->kernel.stride_y = poperation->conv.stride_y; + operation->kernel.stride_x = poperation->conv.stride_x; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + operation->kernel.depthwise = is_depthwise(poperation); + operation->kernel.scale = poperation->conv.weight_tensor->scale; + operation->kernel.zero_point = poperation->conv.weight_tensor->zero_point; + operation->kernel.is_signed = poperation->conv.weight_tensor->is_signed; + + operation->conv.part_kernel_first = thames_is_part_kernel_first(operation); + + if (poperation->conv.padding_same) { + unsigned vert = needed_total_padding(input_tensor->dims[1], poperation->conv.stride_y, poperation->conv.weight_tensor->dims[1]); + unsigned horiz = needed_total_padding(input_tensor->dims[2], poperation->conv.stride_x, poperation->conv.weight_tensor->dims[2]); + + operation->pad.top = vert / 2; + operation->pad.left = horiz / 2; + operation->pad.bottom = (vert + 1) / 2; + operation->pad.right = (horiz + 1) / 2; + } else { + operation->pad.top = 0; + operation->pad.left = 0; + operation->pad.bottom = 0; + 
operation->pad.right = 0; + } + + allocate_feature_maps(subgraph, operation); + + thames_sched_operation(subgraph, operation); + thames_fill_coefs(subgraph, operation, poperation->conv.bias_tensor->resource, poperation->conv.weight_tensor->resource); +} + +static void +thames_lower_pooling(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct thames_operation *operation) +{ + operation->type = THAMES_OPERATION_TYPE_POOLING; + operation->pooling.avg = poperation->pooling.type == PIPE_ML_POOLING_TYPE_AVG; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + + operation->kernel.height = poperation->pooling.filter_height; + operation->kernel.width = poperation->pooling.filter_width; + operation->kernel.stride_y = poperation->pooling.stride_y; + operation->kernel.stride_x = poperation->pooling.stride_x; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + if (poperation->pooling.padding_same) { + unsigned vert = needed_total_padding(operation->ifm.shape.height, poperation->pooling.stride_y, poperation->pooling.filter_height); + unsigned horiz = needed_total_padding(operation->ifm.shape.width, poperation->pooling.stride_x, poperation->pooling.filter_width); + + operation->pad.top = vert / 2; + operation->pad.left = horiz / 2; + operation->pad.bottom = (vert + 1) / 2; + operation->pad.right = (horiz + 1) / 2; + } else { + operation->pad.top = 0; + operation->pad.left = 0; + operation->pad.bottom = 0; + operation->pad.right = 0; + } + + allocate_feature_maps(subgraph, operation); + thames_sched_operation(subgraph, operation); +} + +static void +thames_lower_concatenation(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + unsigned input_idx, + struct thames_operation *operation) +{ + operation->type = THAMES_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[input_idx], 
poperation->output_tensors[0], operation); + operation->ofm.shape.depth = operation->ifm.shape.depth; + + operation->round_mode = THAMES_ROUNDING_NATURAL; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + for (unsigned i = 0; i < input_idx; i++) { + struct thames_tensor *tensor = thames_find_tensor(subgraph, operation->ofm.tensor_idx); + + if (tensor->layout == THAMES_LAYOUT_NHWC) + operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[3]; + else if (tensor->layout == THAMES_LAYOUT_NHCWB16) + operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * ALIGN(poperation->input_tensors[i]->dims[3], 16); + else + assert(0 && "Unsupported layout"); + } + + thames_sched_operation(subgraph, operation); +} + +static void +thames_lower_resize(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct thames_operation *operation) +{ + operation->type = THAMES_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + operation->ifm.zero_point = 0; + operation->ofm.zero_point = 0; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + operation->upscale = true; + + allocate_feature_maps(subgraph, operation); + thames_sched_operation(subgraph, operation); +} + +static void +thames_lower_strided_slice(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct thames_operation *operation) +{ + operation->type = THAMES_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[0], 
poperation->output_tensors[0], operation); + operation->ifm.shape = operation->ofm.shape; + operation->ifm.zero_point = 0; + operation->ofm.zero_point = 0; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + + unsigned augmented_coord[5]; + augmented_coord[0] = 0; + for (int i = 0; i < 4; ++i) { + augmented_coord[i + 1] = poperation->slice.begin[i]; + } + + unsigned augmented_strides[5]; + augmented_strides[0] = operation->ifm.shape.depth * operation->ifm.shape.width * operation->ifm.shape.height; + augmented_strides[1] = 1; + augmented_strides[2] = operation->ifm.shape.depth * operation->ifm.shape.width; + augmented_strides[3] = operation->ifm.shape.depth; + augmented_strides[4] = 1; + + unsigned address_offset = 0; + for (int i = 0; i < 5; ++i) + address_offset += augmented_coord[i] * augmented_strides[i]; + + operation->ifm.tiles.addresses[0] += address_offset; + + thames_sched_operation(subgraph, operation); +} + +static void +thames_lower_add(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct thames_operation *operation) +{ + operation->type = THAMES_OPERATION_TYPE_ELTWISE; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + + operation->ifm2.tensor_idx = poperation->input_tensors[1]->index; + operation->ifm2.shape.height = poperation->input_tensors[1]->dims[1]; + operation->ifm2.shape.width = poperation->input_tensors[1]->dims[2]; + operation->ifm2.shape.depth = poperation->input_tensors[1]->dims[3]; + operation->ifm2.zero_point = poperation->input_tensors[1]->zero_point; + operation->ifm2.scale = poperation->input_tensors[1]->scale; + operation->ifm2.is_signed = poperation->input_tensors[1]->is_signed; + + operation->kernel.height = 1; + operation->kernel.width = 1; + 
operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + + thames_allocate_feature_map(subgraph, &operation->ifm2); + operation->ifm2.tiles.height_0 = operation->ifm2.shape.height; + operation->ifm2.tiles.height_1 = operation->ifm2.shape.height; + operation->ifm2.tiles.width_0 = operation->ifm2.shape.width; + + thames_sched_operation(subgraph, operation); +} + +static void +thames_lower_dma(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct thames_operation *conv_operation, + struct thames_operation *operation) +{ + operation->type = THAMES_OPERATION_TYPE_DMA; + + operation->dma.address = conv_operation->conv.scales.address; + operation->dma.size = conv_operation->conv.scales.size + conv_operation->conv.weights.size; + + conv_operation->conv.scales.region = SCRATCH_REGION; + conv_operation->conv.scales.address = 0; + + conv_operation->conv.weights.region = SCRATCH_REGION; + conv_operation->conv.weights.address = conv_operation->conv.scales.size; +} + +static void +register_tensors(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperations, + unsigned count) +{ + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + + for (unsigned j = 0; j < poperation->input_count; j++) { + struct pipe_tensor *ptensor = poperation->input_tensors[j]; + thames_register_tensor(subgraph, ptensor); + } + + for (unsigned j = 0; j < poperation->output_count; j++) { + struct pipe_tensor *ptensor = poperation->output_tensors[j]; + thames_register_tensor(subgraph, ptensor); + } + } +} + +void +thames_lower_graph(struct thames_subgraph *subgraph, + const struct pipe_ml_operation *poperations, unsigned count) +{ + register_tensors(subgraph, poperations, count); + + /* Lower */ + for (int i = 0; i < count; i++) { + struct thames_operation operation = {0}; + 
+ switch (poperations[i].type) { + + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: { + struct pipe_tensor *input_tensor = poperations[i].input_tensors[0]; + const struct pipe_ml_operation *producer = thames_find_first_producer(poperations, count, input_tensor->index); + bool padded_input = producer && producer->type == PIPE_ML_OPERATION_TYPE_PAD; + + if (padded_input) { + input_tensor = producer->input_tensors[0]; + } + + thames_lower_convolution(subgraph, &poperations[i], input_tensor, &operation); + + if (padded_input) { + operation.pad.top = 1; + operation.pad.left = 1; + } + + util_dynarray_append(&subgraph->operations, operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_ADD: { + thames_lower_add(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_POOLING: { + thames_lower_pooling(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: { + thames_lower_strided_slice(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_CONCATENATION: { + for (int j = 0; j < poperations[i].input_count; j++) { + thames_lower_concatenation(subgraph, &poperations[i], j, &operation); + util_dynarray_append(&subgraph->operations, operation); + } + break; + } + + case PIPE_ML_OPERATION_TYPE_RESIZE: { + thames_lower_resize(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_PAD: { + // Just ignore the pad operation for now, as it will be handled by its consumers + break; + } + + default: + DBG("poperation->type %d\n", poperations[i].type); + UNREACHABLE("Unsupported ML operation type"); + } + } +} \ No newline at end of file diff --git a/src/gallium/drivers/thames/thames_lower.h 
b/src/gallium/drivers/thames/thames_lower.h
new file mode 100644
index 00000000000..07440e991b1
--- /dev/null
+++ b/src/gallium/drivers/thames/thames_lower.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef THAMES_LOWER_H
+#define THAMES_LOWER_H
+
+#include "thames_ml.h"
+
+void
+thames_lower_graph(struct thames_subgraph *subgraph,
+                   const struct pipe_ml_operation *poperations, unsigned count);
+
+#endif /* THAMES_LOWER_H */
diff --git a/src/gallium/drivers/thames/thames_ml.c b/src/gallium/drivers/thames/thames_ml.c
new file mode 100644
index 00000000000..06fcd65fb75
--- /dev/null
+++ b/src/gallium/drivers/thames/thames_ml.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/macros.h"
+#include "util/u_dynarray.h"
+#include "util/u_inlines.h"
+
+#include <errno.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <xf86drm.h>
+
+#include "drm-uapi/thames_accel.h"
+
+#include "thames_cmd.h"
+#include "thames_lower.h"
+#include "thames_ml.h"
+
+void
+thames_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
+                   int suboperation_nr, int offset, unsigned size)
+{
+   char buffer[255];
+
+   snprintf(buffer, sizeof(buffer), "mesa-%s-%03u-%03u.bin", name, operation_nr,
+            suboperation_nr);
+
+   FILE *f = fopen(buffer, "wb");
+   assert(f);
+   fwrite(ptr + offset, 1, size, f);
+   if (ferror(f)) {
+      DBG("Error in writing to file: %s\n", strerror(errno));
+   }
+   fflush(f);
+   fclose(f);
+}
+
+void
+thames_register_tensor(struct thames_subgraph *subgraph,
+                       const struct pipe_tensor *ptensor)
+{
+   struct thames_tensor new_tensor = {0};
+   new_tensor.index = ptensor->index;
+   new_tensor.shape.height = ptensor->dims[1];
+   new_tensor.shape.width = ptensor->dims[2];
+   new_tensor.shape.depth = ptensor->dims[3];
+
new_tensor.layout = THAMES_LAYOUT_NHWC; + util_dynarray_append(&subgraph->tensors, new_tensor); +} + +void +thames_allocate_feature_map(struct thames_subgraph *subgraph, struct thames_feature_map *feature_map) +{ + struct thames_tensor *tensor = thames_find_tensor(subgraph, feature_map->tensor_idx); + unsigned size; + + if (tensor->layout == THAMES_LAYOUT_NHWC) { + size = tensor->shape.width * tensor->shape.height * tensor->shape.depth; + } else if (tensor->layout == THAMES_LAYOUT_NHCWB16) { + size = tensor->shape.width * tensor->shape.height * ALIGN(tensor->shape.depth, 16); + } else { + assert(0 && "Unsupported layout"); + size = 0; // This should never happen + } + + assert(tensor); + + if (tensor->size > 0) { + feature_map->tiles.addresses[0] = tensor->offset; + return; + } + + tensor->offset = subgraph->io_used; + tensor->size = size; + subgraph->io_used += ALIGN_POT(size, 16); + + feature_map->tiles.addresses[0] = tensor->offset; +} + +struct thames_tensor * +thames_find_tensor(struct thames_subgraph *subgraph, unsigned tensor_idx) +{ + util_dynarray_foreach (&subgraph->tensors, struct thames_tensor, tensor) { + if (tensor->index == tensor_idx) { + return tensor; + } + } + return NULL; +} + +int +thames_round_up_to_multiple(int a, int b) +{ + return ((a + b - 1) / b) * b; +} + +int +thames_round_up_divide(int a, int b) +{ + return (a + b - 1) / b; +} + +int +thames_quantize_scale(double scale, uint32_t *shift) +{ + int exponent = 0; + double significand = frexp(scale, &exponent); + uint32_t quantized_scale = round(significand * (double)(1LL << 31)); + *shift = 31 - exponent; + if (*shift > 63) { + if (quantized_scale > exp2(*shift - 63)) { + quantized_scale = quantized_scale >> (*shift - 63); + *shift = 63; + } else { + // Not possible to get back within bounds, set scale and shift to 0 + // as the shift would shift away all relevant bits anyway. 
+ quantized_scale = 0; + *shift = 0; + } + } else if (*shift < 0 && quantized_scale < exp2(*shift + 32)) { + quantized_scale = quantized_scale << (0 - *shift); + *shift = 0; + } + + return quantized_scale; +} + +static bool +tensor_quantization_supported(struct pipe_tensor *tensor) +{ + /* + * Per-axis quantization not supported, for details see: + * https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor + */ + return tensor->scales == NULL && tensor->zero_points == NULL; +} + +bool +thames_ml_operation_supported(struct pipe_context *pcontext, + const struct pipe_ml_operation *operation) +{ + bool supported = false; + + switch (operation->type) { + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: { + struct pipe_tensor *input_tensor = operation->input_tensors[0]; + struct pipe_tensor *weight_tensor = operation->conv.weight_tensor; + struct pipe_tensor *bias_tensor = operation->conv.bias_tensor; + struct pipe_tensor *output_tensor = operation->output_tensors[0]; + + // Dilation and per-axis quantization not yet implemented + if (tensor_quantization_supported(input_tensor) && + tensor_quantization_supported(weight_tensor) && + tensor_quantization_supported(bias_tensor) && + tensor_quantization_supported(output_tensor) && + operation->conv.dilation_width_factor == 1 && + operation->conv.dilation_height_factor == 1) + supported = true; + + break; + } + case PIPE_ML_OPERATION_TYPE_ADD: + supported = operation->input_tensors[0]->resource == NULL && + operation->input_tensors[1]->resource == NULL; + break; + case PIPE_ML_OPERATION_TYPE_POOLING: + case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: + case PIPE_ML_OPERATION_TYPE_PAD: + case PIPE_ML_OPERATION_TYPE_RESIZE: + supported = true; + break; + case PIPE_ML_OPERATION_TYPE_CONCATENATION: + supported = operation->conc.axis == 3 || + operation->conc.axis == -1; + break; + default: + supported = false; + } + + return supported; +} + +static const uint8_t kernel_data[] = { +#include "thames_kernel_bin.h" +}; + 
+struct pipe_ml_subgraph * +thames_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count) +{ + struct pipe_screen *pscreen = pcontext->screen; + struct thames_screen *screen = thames_screen(pscreen); + struct thames_subgraph *subgraph; + + subgraph = calloc(1, sizeof(*subgraph)); + subgraph->base.context = pcontext; + + util_dynarray_init(&subgraph->tensors, NULL); + util_dynarray_init(&subgraph->operations, NULL); + + thames_lower_graph(subgraph, poperations, count); + +#if 0 + + thames_emit_cmdstream(subgraph); + + struct drm_thames_cmdstream_bo_create cmd_bo_create = { + .size = (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor), + .data = (uintptr_t)subgraph->cmdstream, + }; + + if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) + thames_dump_buffer((uint8_t *)subgraph->cmdstream, "cmdstream", 0, 0, 0, (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor)); + + int ret = drmIoctl(screen->fd, DRM_IOCTL_THAMES_CMDSTREAM_BO_CREATE, &cmd_bo_create); + assert(ret == 0); + + free(subgraph->cmdstream); + + subgraph->cmdstream_bo = cmd_bo_create.handle; + + if (subgraph->coefs_used > 0) { + subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->coefs_used); + pipe_buffer_write(subgraph->base.context, subgraph->coefs_rsrc, 0, subgraph->coefs_used, subgraph->coefs); + + free(subgraph->coefs); + subgraph->coefs = NULL; + + if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, + PIPE_MAP_READ, &transfer_in); + thames_dump_buffer(buf, "coefs", 0, 0, 0, pipe_buffer_size(subgraph->coefs_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + } +#endif + + subgraph->kernel_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, ARRAY_SIZE(kernel_data)); + pipe_buffer_write(pcontext, subgraph->kernel_rsrc, 0, ARRAY_SIZE(kernel_data), kernel_data); + + 
struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->kernel_rsrc, + PIPE_MAP_READ, &transfer_in); + DBG("Copied string %s to BO %d at %p\n", kernel_data, thames_resource(subgraph->kernel_rsrc)->handle, buf); + + subgraph->io_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->io_used); + + return &subgraph->base; +} + +void +thames_ml_subgraph_invoke(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned inputs_count, unsigned input_idxs[], + void *inputs[], bool is_signed[]) +{ + struct thames_screen *screen = thames_screen(pcontext->screen); + struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph); + struct drm_thames_submit submit = {0}; + struct drm_thames_job job = {0}; + struct timespec start, end; + int ret; + + for (unsigned i = 0; i < inputs_count; i++) { + struct thames_tensor *input = thames_find_tensor(subgraph, input_idxs[i]); + assert(input); + + if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) + thames_dump_buffer(inputs[i], "input", 0, 0, 0, input->size); + + pipe_buffer_write(pcontext, subgraph->io_rsrc, input->offset, input->size, inputs[i]); + } + + if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, + PIPE_MAP_READ, &transfer_in); + thames_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + + job.kernel = thames_resource(subgraph->kernel_rsrc)->handle; + job.kernel_size = pipe_buffer_size(subgraph->kernel_rsrc); + + if (subgraph->coefs_rsrc) + job.region_bo_handles[COEFS_REGION] = thames_resource(subgraph->coefs_rsrc)->handle; + + job.region_bo_handles[IO_REGION] = thames_resource(subgraph->io_rsrc)->handle; + + submit.jobs = (uintptr_t)&job; + submit.job_count = 1; + + if (DBG_ENABLED(THAMES_DBG_MSGS)) + clock_gettime(CLOCK_MONOTONIC_RAW, &start); + + ret = 
drmIoctl(screen->fd, DRM_IOCTL_THAMES_SUBMIT, &submit); + assert(ret == 0); + + if (DBG_ENABLED(THAMES_DBG_MSGS)) { + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + long long duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + DBG("Submission took %lld ms\n", duration_ns / 1000000); + + /* Force a sync */ + struct pipe_transfer *transfer_in; + pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, PIPE_MAP_READ, &transfer_in); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + DBG("Execution took %lld ms\n", duration_ns / 1000000); + } +} + +void +thames_ml_subgraph_read_outputs(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned outputs_count, + unsigned output_idxs[], void *outputsv[], + bool is_signed[]) +{ + struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph); + uint8_t **outputs = (uint8_t **)outputsv; + + for (int i = 0; i < outputs_count; i++) { + struct thames_tensor *output = thames_find_tensor(subgraph, output_idxs[i]); + + if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, + PIPE_MAP_READ, &transfer_in); + thames_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + + pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size, outputs[i]); + } +} + +void +thames_ml_subgraph_destroy(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph) +{ + struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph); + + pipe_resource_reference(&subgraph->io_rsrc, NULL); + pipe_resource_reference(&subgraph->coefs_rsrc, NULL); + pipe_resource_reference(&subgraph->kernel_rsrc, NULL); + + 
util_dynarray_fini(&subgraph->operations); + util_dynarray_fini(&subgraph->tensors); + + free(subgraph); +} diff --git a/src/gallium/drivers/thames/thames_ml.h b/src/gallium/drivers/thames/thames_ml.h new file mode 100644 index 00000000000..70faf82f7d4 --- /dev/null +++ b/src/gallium/drivers/thames/thames_ml.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2025 Texas Instruments Incorporated - https://www.ti.com/ + * SPDX-License-Identifier: MIT + */ + +#ifndef THAMES_ML_H +#define THAMES_ML_H + +#include + +#include "thames_device.h" + +#define SHRAM_BANKS 48 +#define SHRAM_RESERVED_OUTPUT_BANKS 2 +#define SHRAM_RESERVED_UNUSED_BANKS 2 +#define SHRAM_RESERVED_END_BANKS 2 +#define SHRAM_TOTAL_BANKS SHRAM_BANKS +#define SHRAM_BANK_SIZE_BYTES 1024 +#define ACC_BITS 32 /* Use for now always 32-bit accumulators */ +#define IFM_GRANULE 8 +#define ACC_GRANULE 16 +#define ARCH_SPLIT_DEPTH 16 +#define BANK_SIZE_BYTES 1024 +#define IFM_GRANULE 8 + +extern struct thames_block ARCH_OFM_BLOCK_MAX; +extern struct thames_block SUB_KERNEL_MAX; +extern struct thames_block IFM_UBLOCK; +extern struct thames_block OFM_UBLOCK; + +#define COEFS_REGION 0 +#define IO_REGION 1 +#define SCRATCH_REGION 2 + +struct thames_block { + unsigned width; + unsigned height; + unsigned depth; +}; + +enum thames_operation_type { + THAMES_OPERATION_TYPE_CONVOLUTION, + THAMES_OPERATION_TYPE_POOLING, + THAMES_OPERATION_TYPE_ELTWISE, + THAMES_OPERATION_TYPE_DMA, +}; + +struct thames_tile_box { + unsigned height_0; /* The height of tile 0 */ + unsigned height_1; /* The height of tile 1, 0 if unused */ + unsigned width_0; /* The width of tile 0, and tile 2 (if used) */ + unsigned addresses[4]; /* A list of 4 addresses, set unused addresses to 0 */ +}; + +enum thames_layout { + THAMES_LAYOUT_NHWC, + THAMES_LAYOUT_NHCWB16, +}; + +enum thames_rounding_mode { + THAMES_ROUNDING_DOUBLE = 0, + THAMES_ROUNDING_TRUNCATE, + THAMES_ROUNDING_NATURAL, +}; +struct thames_feature_map { + unsigned tensor_idx; + struct 
thames_block shape; + bool is_signed; + struct thames_tile_box tiles; + unsigned zero_point; + float scale; +}; + +struct thames_kernel { + unsigned height; + unsigned width; + unsigned stride_y; + unsigned stride_x; + unsigned dilation_y; + unsigned dilation_x; + bool depthwise; + bool is_signed; + unsigned zero_point; + float scale; +}; + +struct thames_padding { + unsigned top; + unsigned left; + unsigned bottom; + unsigned right; +}; + +struct thames_address_range { + unsigned region; + unsigned address; + long size; +}; + +struct thames_shram_layout { + unsigned ib_start; + unsigned ib_end; + unsigned ib_start2; + unsigned ab_start; + unsigned lut_start; +}; + +enum thames_acc_type { + THAMES_ACC_TYPE_INT_32BIT = 0, + THAMES_ACC_TYPE_INT_40BIT, + THAMES_ACC_TYPE_FP_S5_10, +}; + +struct thames_block_config { + struct thames_block ifm_block; + struct thames_block ofm_block; + struct thames_shram_layout shram_layout; + unsigned bank_size; + enum thames_acc_type acc_type; + bool is_partkernel; +}; + +#define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT*/ + +struct thames_operation { + enum thames_operation_type type; + + struct thames_block_config block_config; + + union { + struct { + struct thames_address_range weights; + struct thames_address_range scales; + bool part_kernel_first; + bool depthwise; + } conv; + + struct { + bool avg; /* true for avg, false for max */ + } pooling; + + struct { + unsigned lut_bytes; + } eltwise; + + struct { + unsigned address; + long size; + } dma; + }; + + struct thames_feature_map ifm; + struct thames_feature_map ifm2; + struct thames_feature_map ofm; + + struct thames_kernel kernel; + struct thames_padding pad; + bool upscale; + enum thames_rounding_mode round_mode; + + struct thames_address_range read_accesses[MAX_MEMORY_ACCESSES]; + struct thames_address_range write_accesses[MAX_MEMORY_ACCESSES]; +}; + +struct thames_tensor { + unsigned index; + unsigned offset; + unsigned size; + struct thames_block shape; + 
enum thames_layout layout; +}; + +struct thames_subgraph { + struct pipe_ml_subgraph base; + + struct util_dynarray operations; /* thames_operation */ + struct util_dynarray tensors; /* thames_tensor* */ + + struct pipe_resource *kernel_rsrc; + + struct pipe_resource *io_rsrc; + unsigned io_used; + + uint8_t *coefs; + struct pipe_resource *coefs_rsrc; + unsigned coefs_used; +}; + +bool +thames_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation); + +struct pipe_ml_subgraph * +thames_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count); + +void thames_ml_subgraph_invoke(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned inputs_count, unsigned input_idxs[], + void *inputs[], bool is_signed[]); + +void thames_ml_subgraph_read_outputs(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned outputs_count, + unsigned output_idxs[], void *outputs[], + bool is_signed[]); + +void thames_ml_subgraph_destroy(struct pipe_context *context, + struct pipe_ml_subgraph *psubgraph); + +void thames_allocate_feature_map(struct thames_subgraph *subgraph, struct thames_feature_map *feature_map); + +void thames_register_tensor(struct thames_subgraph *subgraph, const struct pipe_tensor *ptensor); + +struct thames_tensor *thames_find_tensor(struct thames_subgraph *subgraph, unsigned tensor_idx); + +void thames_dump_buffer(const uint8_t *ptr, char *name, int operation_nr, + int suboperation_nr, int offset, unsigned size); + +int thames_round_up_to_multiple(int a, int b); + +int thames_round_up_divide(int a, int b); + +int thames_quantize_scale(double scale, uint32_t *shift); + +#endif /* THAMES_ML_H */ diff --git a/src/gallium/drivers/thames/thames_public.h b/src/gallium/drivers/thames/thames_public.h new file mode 100644 index 00000000000..72f02ae53f3 --- /dev/null +++ b/src/gallium/drivers/thames/thames_public.h @@ -0,0 +1,3 @@ 
+#include "pipe/p_state.h" + +struct pipe_ml_device *thames_ml_device_create(const char *spec); diff --git a/src/gallium/drivers/thames/thames_sched.c b/src/gallium/drivers/thames/thames_sched.c new file mode 100644 index 00000000000..d5e76c761d7 --- /dev/null +++ b/src/gallium/drivers/thames/thames_sched.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2025 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "thames_sched.h" + +static int +required_input_size(int value, int stride, int border) +{ + return (value - 1) * stride + border; +} + +static struct thames_block +_get_ifm_blocksize(struct thames_operation *operation, struct thames_block ofm_block) +{ + struct thames_block ifm_block = {0}; + + // IFM block height + int h = required_input_size(ofm_block.height, operation->kernel.stride_y, MIN2(operation->kernel.height, SUB_KERNEL_MAX.height)); + h = ALIGN(h, OFM_UBLOCK.height); + + // IFM block width + int w = required_input_size(ofm_block.width, operation->kernel.stride_x, MIN2(operation->kernel.width, SUB_KERNEL_MAX.width)); + w = ALIGN(w, OFM_UBLOCK.width); + + ifm_block.height = h; + ifm_block.width = w; + ifm_block.depth = ofm_block.depth; + + return ifm_block; +} + +static bool +try_block_config(struct thames_operation *operation, struct thames_block ofm_block, struct thames_block ifm_block, struct thames_shram_layout *layout) +{ + int ifm_bytes = ifm_block.width * ifm_block.height * ALIGN(ifm_block.depth, 8); + int ifm_banks = ALIGN(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE); + int lut_bytes = operation->type == THAMES_OPERATION_TYPE_ELTWISE ? 
operation->eltwise.lut_bytes : 0; + int lut_banks = MAX2(DIV_ROUND_UP(lut_bytes, 1024), SHRAM_RESERVED_END_BANKS); + int lut_start = SHRAM_TOTAL_BANKS - lut_banks; + int ifm_end = SHRAM_RESERVED_OUTPUT_BANKS + ifm_banks; + int ifm2_start = ifm_end; + int acc_start = lut_start; + + if (operation->type != THAMES_OPERATION_TYPE_ELTWISE) { + int acc_bytes = (ofm_block.width * ofm_block.height * ALIGN(ofm_block.depth, 8) * 32) / 8; + int acc_banks = ALIGN(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE); + acc_start -= acc_banks; + } else { + int ifm2_banks = ifm_banks; /* TODO: Fix for scalar eltwise */ + + if (ifm2_start + ifm2_banks > acc_start) + return false; + + ifm_end = acc_start; + } + + if (ifm_end > acc_start) + return false; + + layout->ib_start = SHRAM_RESERVED_OUTPUT_BANKS; + layout->ib_start2 = ifm2_start; + layout->ib_end = ifm_end; + layout->ab_start = acc_start; + layout->lut_start = lut_start; + + return true; +} + +static struct thames_block_config +find_block_config(struct thames_operation *operation) +{ + struct thames_block_config config = {}; + struct thames_block search_space = ARCH_OFM_BLOCK_MAX; + float ofm_elements = operation->ofm.shape.width * operation->ofm.shape.height * operation->ofm.shape.depth; + float ifm_elements = operation->ifm.shape.width * operation->ifm.shape.height * operation->ifm.shape.depth; + bool is_pooling = operation->type == THAMES_OPERATION_TYPE_POOLING; + bool is_depthwise = operation->conv.depthwise; + bool is_equal_depth = is_pooling || is_depthwise || operation->type == THAMES_OPERATION_TYPE_ELTWISE; + bool is_convolution = operation->type == THAMES_OPERATION_TYPE_CONVOLUTION; + float best_cost = FLT_MAX; + unsigned best_coverage = UINT_MAX; + + search_space.width = MIN2(search_space.width, operation->ofm.shape.width); + search_space.height = MIN2(search_space.height, operation->ofm.shape.height); + search_space.depth = MIN2(search_space.depth, operation->ofm.shape.depth); + + unsigned depth = 
MAX2(OFM_UBLOCK.depth, MIN2(search_space.depth, ARCH_SPLIT_DEPTH)); + + if (depth < operation->ofm.shape.depth) { + depth = ALIGN(depth, ARCH_SPLIT_DEPTH); + } + + search_space.width = ALIGN(search_space.width, OFM_UBLOCK.width); + search_space.height = ALIGN(search_space.height, OFM_UBLOCK.height); + search_space.depth = ALIGN(search_space.depth, OFM_UBLOCK.depth); + + while (depth <= search_space.depth) { + bool wont_fit[search_space.height + 1][search_space.width + 1]; + memset(wont_fit, 0, sizeof(wont_fit)); + + for (unsigned height = OFM_UBLOCK.height; height <= search_space.height; height += OFM_UBLOCK.height) { + for (unsigned width = OFM_UBLOCK.width; width <= search_space.width; width += OFM_UBLOCK.width) { + + if (wont_fit[height][width]) + continue; + + struct thames_block ofm_block = {height, width, depth}; + struct thames_block ifm_block = _get_ifm_blocksize(operation, ofm_block); + + if (!is_equal_depth) + ifm_block.depth = ALIGN(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth); + + // Try to fit the blocks in SHRAM + struct thames_shram_layout layout = {0}; + if (try_block_config(operation, ofm_block, ifm_block, &layout)) { + + struct thames_block full_blocks = {DIV_ROUND_UP(operation->ofm.shape.width, ofm_block.width), + DIV_ROUND_UP(operation->ofm.shape.height, ofm_block.height), + DIV_ROUND_UP(operation->ofm.shape.depth, ofm_block.depth)}; + float blocks[3] = {operation->ofm.shape.width / (float)ofm_block.width, + operation->ofm.shape.height / (float)ofm_block.height, + operation->ofm.shape.depth / (float)ofm_block.depth}; + + float weight_area = is_convolution ? 
operation->kernel.width * operation->kernel.height : 0; + float weight_fetch = weight_area * operation->ifm.shape.depth * full_blocks.width * full_blocks.height; + if (!is_depthwise) + weight_fetch *= blocks[2] * ofm_block.depth; + + float ifm_fetch = ifm_block.width * ifm_block.height * operation->ifm.shape.depth * blocks[0] * blocks[1]; + if (!is_equal_depth) + ifm_fetch *= full_blocks.depth; + + float relative_cost = 0; + if (operation->type != THAMES_OPERATION_TYPE_ELTWISE) + relative_cost = (ifm_fetch + weight_fetch) / ofm_elements; + else + relative_cost = ofm_elements / (height * width * depth); + + if (ifm_elements < ifm_block.width * ifm_block.height * ifm_block.depth * 2) + relative_cost /= 2.0f; + + if (relative_cost <= best_cost) { + bool choose_this = false; + + if (relative_cost == best_cost) { + struct thames_block coverage_shape = { + MIN2(ifm_block.height, operation->ifm.shape.height), + MIN2(ifm_block.width, operation->ifm.shape.width), + MIN2(ifm_block.depth, operation->ifm.shape.depth)}; + float coverage = (float)(operation->ifm.shape.width * operation->ifm.shape.height) / + (float)MAX2(1, coverage_shape.width * coverage_shape.height); + + if (coverage <= best_coverage && (height <= 4 && width <= 4)) { + best_coverage = coverage; + choose_this = true; + } + } else { + best_coverage = UINT_MAX; + choose_this = true; + } + + if (choose_this) { + config.shram_layout = layout; + config.ifm_block = ifm_block; + config.ofm_block.height = height; + config.ofm_block.width = width; + config.ofm_block.depth = depth; + + best_cost = relative_cost; + } + } + } else { + wont_fit[height][width] = true; + } + } + } + + depth += OFM_UBLOCK.depth; + if (depth < operation->ofm.shape.depth) { + depth = ALIGN(depth, ARCH_SPLIT_DEPTH); + } + } + + return config; +} + +void +thames_sched_operation(struct thames_subgraph *subgraph, struct thames_operation *operation) +{ + operation->block_config = find_block_config(operation); +} diff --git 
a/src/gallium/drivers/thames/thames_sched.h b/src/gallium/drivers/thames/thames_sched.h
new file mode 100644
index 00000000000..1e296c64e96
--- /dev/null
+++ b/src/gallium/drivers/thames/thames_sched.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef THAMES_SCHED_H
+#define THAMES_SCHED_H
+
+#include "thames_ml.h"
+
+/* Compute the block configuration (OFM/IFM block sizes and SHRAM layout)
+ * for the operation and store it in operation->block_config.
+ * NOTE(review): subgraph is currently unused by the implementation. */
+void thames_sched_operation(struct thames_subgraph *subgraph, struct thames_operation *operation);
+
+#endif /* THAMES_SCHED_H */
diff --git a/src/gallium/frontends/teflon/tfl_device.c b/src/gallium/frontends/teflon/tfl_device.c
index 5cd21dc1e95..652dce89eb8 100644
--- a/src/gallium/frontends/teflon/tfl_device.c
+++ b/src/gallium/frontends/teflon/tfl_device.c
@@ -856,7 +856,8 @@ find_accel_device()
 
    for (int i = 0; i < n; i++) {
       if (strstr("rocket", devs[i]->driver_name) ||
-          strstr("ethosu", devs[i]->driver_name))
+          strstr("ethosu", devs[i]->driver_name) ||
+          strstr("thames", devs[i]->driver_name))
          device = devs[i];
       else
          pipe_loader_release(&devs[i], 1);
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index 9b02fd1189a..fe10ebe2e72 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -196,6 +196,12 @@ if with_gallium_ethosu
 else
   driver_ethosu = declare_dependency()
 endif
+if with_gallium_thames
+  subdir('winsys/thames/drm')
+  subdir('drivers/thames')
+else
+  driver_thames = declare_dependency()
+endif
 if with_gallium_zink
   subdir('drivers/zink')
 else
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index 134d7e4adf8..cdb99f3e448 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -59,7 +59,7 @@ libgallium_dri = shared_library(
     driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
     driver_tegra, driver_i915, driver_svga, driver_virgl,
     driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
-    driver_asahi, driver_crocus, driver_rocket, driver_ethosu
+    driver_asahi, driver_crocus, driver_rocket, driver_ethosu, driver_thames
   ],
   install : true,
   name_suffix : libname_suffix,
diff --git a/src/gallium/targets/teflon/meson.build b/src/gallium/targets/teflon/meson.build
index e445e091793..8a31d4f8ed2 100644
--- a/src/gallium/targets/teflon/meson.build
+++ b/src/gallium/targets/teflon/meson.build
@@ -10,6 +10,7 @@ libteflon = shared_library(
     driver_etnaviv,
     driver_rocket,
     driver_ethosu,
+    driver_thames,
     idep_nir,
     idep_mesautil,
   ],
diff --git a/src/gallium/winsys/thames/drm/meson.build b/src/gallium/winsys/thames/drm/meson.build
new file mode 100644
index 00000000000..6489975f153
--- /dev/null
+++ b/src/gallium/winsys/thames/drm/meson.build
@@ -0,0 +1,13 @@
+# Copyright 2017 Broadcom
+# SPDX-License-Identifier: MIT
+
+libthameswinsys = static_library(
+  'thameswinsys',
+  files('thames_drm_winsys.c'),
+  include_directories : [
+    inc_src, inc_include,
+    inc_gallium, inc_gallium_aux, inc_gallium_drivers,
+  ],
+  gnu_symbol_visibility : 'hidden',
+  dependencies: [dep_libdrm, idep_mesautil],
+)
diff --git a/src/gallium/winsys/thames/drm/thames_drm_public.h b/src/gallium/winsys/thames/drm/thames_drm_public.h
new file mode 100644
index 00000000000..4372b182a7c
--- /dev/null
+++ b/src/gallium/winsys/thames/drm/thames_drm_public.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef __THAMES_DRM_PUBLIC_H__
+#define __THAMES_DRM_PUBLIC_H__
+
+struct pipe_screen;
+struct pipe_screen_config;
+
+/* Look up or create the pipe_screen for the thames device open on drmFD.
+ * The fd is duplicated internally; the caller keeps ownership of drmFD. */
+struct pipe_screen *
+thames_drm_screen_create(int drmFD, const struct pipe_screen_config *config);
+
+#endif /* __THAMES_DRM_PUBLIC_H__ */
diff --git a/src/gallium/winsys/thames/drm/thames_drm_winsys.c b/src/gallium/winsys/thames/drm/thames_drm_winsys.c
new file mode 100644
index 00000000000..b233c8f21d7
--- /dev/null
+++ b/src/gallium/winsys/thames/drm/thames_drm_winsys.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2014 Broadcom
+ * Copyright 2018 Alyssa Rosenzweig
+ * Copyright 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/os_file.h"
+#include "util/u_screen.h"
+
+#include "thames/thames_device.h"
+#include "thames_drm_public.h"
+
+/* Winsys entry point: duplicate the caller's fd (caller retains ownership)
+ * and return the pipe_screen registered for it, creating one through
+ * thames_screen_create() if none exists yet. */
+struct pipe_screen *
+thames_drm_screen_create(int fd, const struct pipe_screen_config *config)
+{
+   return u_pipe_screen_lookup_or_create(os_dupfd_cloexec(fd), config, NULL,
+                                         thames_screen_create);
+}