Use MMALib

This commit is contained in:
Tomeu Vizoso 2026-02-13 10:15:50 +01:00
parent b8319435ef
commit cbc347c3ee
5 changed files with 29426 additions and 329 deletions

View file

@ -27,6 +27,10 @@ scp "${KERNEL_SRC}" "${BOARD_USER}@${BOARD_IP}:/tmp/test_kernel.c"
# Create linker command file on board to ensure proper section placement
echo "[1.5/5] Creating linker.cmd on board..."
ssh "${BOARD_USER}@${BOARD_IP}" "cat > /tmp/linker.cmd <<EOF
MEMORY
{
L2SRAM (RWX) : origin = 0x0, length = 0x200000
}
SECTIONS
{
.text : {
@ -34,7 +38,10 @@ SECTIONS
*(.text)
*(.const)
*(.switch)
}
*(.kernel_data)
*(.data)
*(.bss)
} > L2SRAM
}
EOF"
@ -50,6 +57,8 @@ ssh "${BOARD_USER}@${BOARD_IP}" "cd /tmp && ${CL7X_PATH} -mv7524 --abi=eabi -O0
--reread_libs --warn_sections \
--rom_model \
-l${MMALIB_LIB} \
-l/home/tomeu/src/ti-processor-sdk-rtos-j722s-evm-09_02_00_05/mmalib_09_02_00_08/lib/C7524/release/mmalib_cn_C7524.lib \
-l/home/tomeu/src/ti-processor-sdk-rtos-j722s-evm-09_02_00_05/mmalib_09_02_00_08/lib/C7524/release/common_C7524.lib \
-lrts7524_le.lib \
linker.cmd \
-o test_kernel.out"

View file

@ -4,11 +4,20 @@
* This kernel uses TI's MMALib to perform quantized int8 convolution
* accelerated by the C7x Matrix Multiply Accelerator (MMA) hardware.
*
* Data layout contract with Mesa (CPU) side:
* - Input arrives in CHW layout (transposed from NHWC by Mesa)
* Layout: [numInChannels][inChOffset] where inChOffset = ALIGN(H*W, 64)
* - Weights arrive in OHWI layout (TFLite native, no transpose needed)
* Layout: [numOutChannels][kH * kW * numInChannels]
* - Bias: [numOutChannels] int32 values
* - Output produced in CHW layout (Mesa transposes back to NHWC)
* Layout: [numOutChannels][pitchC] where pitchC = ALIGN(outH*outW, 64)
*
* Args:
* args[0] = input tensor (uint8, NHWC layout)
* args[1] = weight tensor (uint8, HWIO layout)
* args[0] = input tensor (uint8, CHW layout, padded to inChOffset per channel)
* args[1] = weight tensor (int8, OHWI layout = [Cout][Cin*Fr*Fc])
* args[2] = bias tensor (int32, per output channel)
* args[3] = output tensor (uint8, NHWC layout)
* args[3] = output tensor (uint8, CHW layout, pitchC per channel)
* args[4] = params struct pointer (uint32):
* params[0] = input_height
* params[1] = input_width
@ -20,14 +29,14 @@
* params[7] = output_channels
* params[8] = stride_h
* params[9] = stride_w
* params[10] = pad_h
* params[11] = pad_w
* params[12] = input_zero_point (unused - MMALib handles this internally)
* params[13] = weight_zero_point (unused - MMALib handles this internally)
* params[14] = output_zero_point (unused - MMALib handles this internally)
* args[5] = quantization params (int32):
* quant[0] = multiplier (int32)
* quant[1] = shift (int32)
* params[10] = pad_top
* params[11] = pad_bottom
* params[12] = pad_left
* params[13] = pad_right
* params[14] = input_zero_point (for padFillValue)
* args[5] = quantization params (uint8 packed):
* quant[0..out_c-1] = scale values (uint8)
* quant[out_c..2*out_c-1] = shift values (uint8)
*
* Compile with:
* See compile-kernel.sh for build procedure
@ -38,68 +47,281 @@
/* MMALib headers - MMA hardware acceleration library */
#include "mmalib.h"
/* Align a value up to the next multiple of 'align' (align must be power of 2) */
#define ALIGN_UP(val, align) (((val) + (align) - 1) & ~((align) - 1))
/*
* Quantized int8 convolution kernel using C7x MMA hardware via MMALib
* Quantized convolution kernel using C7x MMA hardware via MMALib
*
* This is the entry point called from the DRM driver.
* Note: This code runs bare-metal on the DSP, so no stdlib functions.
* Placed in .text.entry section to ensure it's at the beginning of the binary.
*/
/* Static buffers for handle and packed weights */
/* Placed in .kernel_data section so they are uploaded with the kernel code */
#pragma DATA_ALIGN(handle_buffer, 128)
__attribute__((section(".kernel_data")))
static uint8_t handle_buffer[16384];
/* Weights padded to pitchA stride between output channels */
#pragma DATA_ALIGN(weights_padded, 128)
__attribute__((section(".kernel_data")))
static int8_t weights_padded[65536];
/* Weights after MMALib reorder for MMA-friendly layout */
#pragma DATA_ALIGN(weights_reordered, 128)
__attribute__((section(".kernel_data")))
static int8_t weights_reordered[65536];
/* Per-channel scale (uint8) and shift (uint8) for requantization */
#pragma DATA_ALIGN(scale_buf, 128)
__attribute__((section(".kernel_data")))
static uint8_t scale_buf[2048];
#pragma DATA_ALIGN(shift_buf, 128)
__attribute__((section(".kernel_data")))
static uint8_t shift_buf[2048];
#pragma RETAIN(test_kernel)
__attribute__((section(".text.entry")))
int
test_kernel(unsigned long long *args)
{
unsigned char *input = (unsigned char *)args[0];
unsigned char *weights = (unsigned char *)args[1];
int *bias = (int *)args[2];
unsigned char *output = (unsigned char *)args[3];
unsigned int *params = (unsigned int *)args[4];
int *quant = (int *)args[5];
unsigned char *input = (unsigned char *)args[0];
signed char *weights = (signed char *)args[1];
int *bias = (int *)args[2];
unsigned char *output = (unsigned char *)args[3];
unsigned int *params = (unsigned int *)args[4];
unsigned char *quant = (unsigned char *)args[5];
/* Extract convolution parameters */
unsigned int in_h = params[0];
unsigned int in_w = params[1];
unsigned int in_c = params[2];
unsigned int k_h = params[3];
unsigned int k_w = params[4];
unsigned int out_h = params[5];
unsigned int out_w = params[6];
unsigned int out_c = params[7];
unsigned int stride_h = params[8];
unsigned int stride_w = params[9];
int pad_h = (int)params[10];
int pad_w = (int)params[11];
/* Extract quantization parameters */
int multiplier = quant[0];
int shift = quant[1];
/* Basic validation */
if (!input || !weights || !bias || !output || !params || !quant) {
return -1;
}
if (in_w == 0 || in_h == 0 || in_c == 0 ||
k_w == 0 || k_h == 0 || out_c == 0 ||
stride_w == 0 || stride_h == 0) {
return -1;
}
/* TODO: Call MMALib convolution function
*
* This requires:
* 1. Allocating a kernel handle
* 2. Setting up MMALIB_bufParams structures
* 3. Calling MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init()
* 4. Calling MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_exec()
*
* For now, return success to test static linking of mmalib_C7524.lib
*/
return 0; /* Success */
/* Extract convolution parameters */
unsigned int in_h = params[0];
unsigned int in_w = params[1];
unsigned int in_c = params[2];
unsigned int k_h = params[3];
unsigned int k_w = params[4];
unsigned int out_h = params[5];
unsigned int out_w = params[6];
unsigned int out_c = params[7];
unsigned int stride_h = params[8];
unsigned int stride_w = params[9];
int pad_top = (int)params[10];
int pad_bottom = (int)params[11];
int pad_left = (int)params[12];
int pad_right = (int)params[13];
int input_zp = (int)params[14];
/* Basic validation */
if (!input || !weights || !bias || !output || !params || !quant)
return -1;
if (in_w == 0 || in_h == 0 || in_c == 0 ||
k_w == 0 || k_h == 0 || out_c == 0 ||
stride_w == 0 || stride_h == 0)
return -1;
/*
* Derived parameters for MMALib (following test case 1005 pattern)
*
* MMALib convolveBias_row operates on channel-first data:
* A matrix (src0) = weights: [numOutChannels][kDim] with stride pitchA
* B matrix (src1) = input: [numInChannels][inChOffset]
* C matrix (dst) = output: [numOutChannels][pitchC]
*/
int kDim = k_h * k_w * in_c; /* elements per filter */
int pitchA = ALIGN_UP(kDim, 64); /* 64-byte aligned weight row stride */
int spatial = in_w * in_h; /* total spatial locations per channel */
int inChOffset = ALIGN_UP(spatial + 64, 64); /* channel stride with extra padding (test 1005 pattern) */
int outSpatial = out_w * out_h;
int pitchC = ALIGN_UP(outSpatial + 64, 64); /* output channel stride with extra padding */
/*
* Copy per-channel scale and shift from the quant buffer.
* Mesa packs them as: [scale0..scaleN-1][shift0..shiftN-1], both uint8.
*/
if (out_c > 2048)
return -2;
for (unsigned int i = 0; i < out_c; i++) {
scale_buf[i] = quant[i];
shift_buf[i] = quant[out_c + i];
}
/*
* Pad weights from OHWI [out_c][kDim] contiguous to [out_c][pitchA] with
* zero-padding between rows for 64-byte alignment.
*/
int weight_padded_size = out_c * pitchA;
if (weight_padded_size > (int)sizeof(weights_padded))
return -2;
for (unsigned int o = 0; o < out_c; o++) {
for (int j = 0; j < kDim; j++)
weights_padded[o * pitchA + j] = weights[o * kDim + j];
for (int j = kDim; j < pitchA; j++)
weights_padded[o * pitchA + j] = 0;
}
/*
* Weight reorder: rearrange from natural OHWI to MMA-friendly tiled layout.
* Must happen before init.
*/
MMALIB_CNN_convolveBias_row_processWeights_Args reorderArgs;
reorderArgs.funcStyle = MMALIB_FUNCTION_OPTIMIZED;
reorderArgs.data_type = MMALIB_UINT8;
reorderArgs.Fr = k_h;
reorderArgs.Fc = k_w;
reorderArgs.pitchA = pitchA;
reorderArgs.numInChPerGroup = in_c;
reorderArgs.subMChannels = out_c;
reorderArgs.No = out_c;
reorderArgs.numGroupsPerKernel = 1;
reorderArgs.packetizeMode = 1;
int32_t reorderSize = MMALIB_CNN_convolveBias_row_processWeights_getMemorySize(
&reorderArgs, weights_padded);
if (reorderSize > (int)sizeof(weights_reordered))
return -4;
MMALIB_STATUS status = MMALIB_CNN_convolveBias_row_processWeights_reorder(
&reorderArgs, weights_padded, weights_reordered);
if (status != MMALIB_SUCCESS)
return 3000 + (int)status;
/* ---- MMALib structures ---- */
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_InitArgs initArgs;
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_ExecInArgs execInArgs;
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_ExecOutArgs execOutArgs;
MMALIB_bufParams2D_t src0_addr; /* Weights (A matrix) */
MMALIB_bufParams2D_t src1_addr; /* Input feature map (B matrix) */
MMALIB_bufParams2D_t src2_addr; /* Bias */
MMALIB_bufParams1D_t src3_addr; /* Scale */
MMALIB_bufParams3D_t dst_addr; /* Output feature map (C matrix) */
/*
* InitArgs -- following MMALib test case 1005 conventions:
* Fc = kernel width (spatial), NOT multiplied by in_c
* Fr = kernel height
* strideX/Y = spatial stride, NOT multiplied by in_c
* inWidth = spatial width of input feature map
* maxHeight = spatial height of input feature map
* inChOffset = pitch between input channel planes (>= spatial, 64-aligned)
* validColsIn = total valid spatial locations to process
*/
initArgs.funcStyle = MMALIB_FUNCTION_OPTIMIZED;
initArgs.Fc = k_w;
initArgs.Fr = k_h;
initArgs.strideX = stride_w;
initArgs.strideY = stride_h;
initArgs.dilationX = 1;
initArgs.dilationY = 1;
initArgs.inWidth = in_w;
initArgs.maxHeight = in_h;
initArgs.inChOffset = inChOffset;
initArgs.validColsIn = spatial; /* process all spatial locations */
initArgs.validColsPerRowIn = 0; /* 0 for LINEAR stride-1 */
initArgs.validRowsIn = 0; /* 0 for LINEAR stride-1 */
initArgs.inputPitchPerRow = 0; /* 0 for LINEAR stride-1 */
initArgs.outputPitchPerRow = 0; /* 0 for LINEAR stride-1 */
initArgs.No = out_c;
initArgs.subMChannels = out_c; /* process all output channels at once */
initArgs.numGroupsPerKernel = 1;
initArgs.bias = 1; /* bias enabled */
initArgs.activationType = MMALIB_RELU; /* ReLU clamps negative to 0 */
initArgs.pSatMin = 0;
initArgs.pSatMax = 0; /* 0 = use default saturation for data type */
initArgs.mode = MMALIB_LINEAR;
initArgs.col = 0;
initArgs.pad = 0;
initArgs.padTop = pad_top;
initArgs.padBottom = pad_bottom;
initArgs.padLeft = pad_left;
initArgs.padRight = pad_right;
initArgs.validColsOutBottom = 0;
initArgs.packetizeMode = 1; /* weights are reordered/packetized */
/*
* Buffer parameter structures
*
* src0 (weights): 2D [numOutChannels][kDim] with pitchA stride
* src1 (input): 2D [numInChannels][inChOffset]
* src2 (bias): 2D [1][numOutChannels] (int32 per output channel)
* src3 (scale): 1D [numOutChannels] (uint8 per channel)
* dst (output): 3D [numGroupsPerKernel][numOutChannels][pitchC]
*/
src0_addr.data_type = MMALIB_INT8;
src0_addr.dim_x = kDim;
src0_addr.dim_y = out_c;
src0_addr.stride_y = pitchA;
src1_addr.data_type = MMALIB_UINT8;
src1_addr.dim_x = inChOffset;
src1_addr.dim_y = in_c;
src1_addr.stride_y = inChOffset; /* * sizeof(uint8) = inChOffset bytes */
src2_addr.data_type = MMALIB_INT32;
src2_addr.dim_x = out_c;
src2_addr.dim_y = 1;
src2_addr.stride_y = out_c; /* in elements, not bytes — MMALib uses data_type for byte stride */
src3_addr.data_type = MMALIB_UINT8;
src3_addr.dim_x = out_c;
dst_addr.data_type = MMALIB_UINT8;
dst_addr.dim_x = pitchC;
dst_addr.dim_y = out_c;
dst_addr.stride_y = pitchC; /* bytes (uint8) */
dst_addr.dim_z = 1;
dst_addr.stride_z = out_c * pitchC;
/* Get Handle Size */
int32_t handleSize = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_getHandleSize(&initArgs);
if (handleSize > (int)sizeof(handle_buffer))
return -3;
/* Check params before init */
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init_checkParams(
handle_buffer,
&src0_addr, &src1_addr, &src2_addr, &src3_addr, &dst_addr,
&initArgs);
if (status != MMALIB_SUCCESS)
return 4000 + (int)status;
/* Init kernel handle */
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init(
handle_buffer,
&src0_addr, &src1_addr, &src2_addr, &src3_addr, &dst_addr,
&initArgs);
if (status != MMALIB_SUCCESS)
return 1000 + (int)status;
/* Fill ExecInArgs */
execInArgs.validColsIn = spatial;
execInArgs.validColsPerRowIn = 0;
execInArgs.validRowsIn = 0;
execInArgs.col = 0;
execInArgs.subMChannels = out_c;
execInArgs.quantMethod = 1; /* MMALIB_QM_PER_CHANNEL */
execInArgs.padFillValue = input_zp; /* padding fill = input zero point */
execInArgs.enableDynamicRange = 0;
execInArgs.initDynamicRange = 0;
/* Execute convolution */
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_exec(
handle_buffer,
weights_reordered,
input,
bias,
scale_buf,
shift_buf,
output,
&execInArgs,
&execOutArgs);
if (status != MMALIB_SUCCESS)
return 2000 + (int)status;
return 0;
}
/* Dummy main to satisfy linker/RTS requirements (we don't run _c_int00) */
int main(void) { return 0; }

File diff suppressed because it is too large Load diff

View file

@ -265,165 +265,319 @@ thames_ml_subgraph_create(struct pipe_context *pcontext,
input->zero_point, weights->zero_point, bias ? bias->zero_point : 0, output->zero_point);
/* Input dimensions: NHWC */
unsigned int in_h = input->dims[1];
unsigned int in_w = input->dims[2];
unsigned int in_c = input->dims[3];
unsigned in_h = input->dims[1];
unsigned in_w = input->dims[2];
unsigned in_c = input->dims[3];
/* Weight dimensions: TFLite uses OHWI format (output, height, width, input) */
unsigned int out_c = weights->dims[0];
unsigned int k_h = weights->dims[1];
unsigned int k_w = weights->dims[2];
unsigned int k_in_c = weights->dims[3];
unsigned out_c = weights->dims[0];
unsigned k_h = weights->dims[1];
unsigned k_w = weights->dims[2];
unsigned k_in_c = weights->dims[3];
/* Output dimensions: NHWC */
unsigned int out_h = output->dims[1];
unsigned int out_w = output->dims[2];
unsigned out_h = output->dims[1];
unsigned out_w = output->dims[2];
unsigned int stride_h = conv_op->conv.stride_y;
unsigned int stride_w = conv_op->conv.stride_x;
unsigned stride_h = conv_op->conv.stride_y;
unsigned stride_w = conv_op->conv.stride_x;
/* Calculate padding from input/output/kernel dimensions */
unsigned int pad_h, pad_w;
unsigned pad_top, pad_bottom, pad_left, pad_right;
if (conv_op->conv.padding_same) {
pad_h = ((out_h - 1) * stride_h + k_h - in_h) / 2;
pad_w = ((out_w - 1) * stride_w + k_w - in_w) / 2;
unsigned pad_h = ((out_h - 1) * stride_h + k_h - in_h);
unsigned pad_w = ((out_w - 1) * stride_w + k_w - in_w);
pad_top = pad_h / 2;
pad_bottom = pad_h - pad_top;
pad_left = pad_w / 2;
pad_right = pad_w - pad_left;
} else {
pad_h = 0;
pad_w = 0;
pad_top = pad_bottom = pad_left = pad_right = 0;
}
DBG("Conv: %ux%ux%u input, %ux%ux%ux%u kernel (k_in_c=%u), stride=%u,%u, pad=%u,%u -> %ux%ux%u output\n",
in_h, in_w, in_c, k_h, k_w, k_in_c, out_c, k_in_c, stride_h, stride_w, pad_h, pad_w, out_h, out_w, out_c);
DBG("Conv: %ux%ux%u input, %ux%ux%ux%u kernel (k_in_c=%u), stride=%u,%u, "
"pad=%u/%u/%u/%u -> %ux%ux%u output\n",
in_h, in_w, in_c, k_h, k_w, k_in_c, out_c, k_in_c,
stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right,
out_h, out_w, out_c);
/*
* Store convolution dimensions in subgraph for use during invoke/read.
* MMALib expects CHW layout, so we compute aligned pitches.
*/
unsigned spatial = in_w * in_h;
unsigned inChOffset = ALIGN_POT(spatial + 64, 64); /* extra padding (test 1005 pattern) */
unsigned outSpatial = out_w * out_h;
unsigned pitchC = ALIGN_POT(outSpatial + 64, 64); /* extra padding (test 1005 pattern) */
subgraph->conv_in_h = in_h;
subgraph->conv_in_w = in_w;
subgraph->conv_in_c = in_c;
subgraph->conv_out_h = out_h;
subgraph->conv_out_w = out_w;
subgraph->conv_out_c = out_c;
subgraph->conv_inChOffset = inChOffset;
subgraph->conv_pitchC = pitchC;
size_t input_size = in_h * in_w * in_c;
size_t weight_size = k_h * k_w * k_in_c * out_c;
size_t bias_size = out_c * sizeof(int32_t);
size_t output_size = out_h * out_w * out_c;
/* Allocate buffers - reuse coefs_rsrc for weights, create new one for bias if needed */
subgraph->input_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(input_size, 1024));
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(weight_size, 1024));
subgraph->output_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(output_size, 1024));
/* Create bias buffer */
subgraph->bias_rsrc = NULL;
if (bias && bias->resource) {
subgraph->bias_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(bias_size, 1024));
}
/* Input buffer: CHW layout [in_c][inChOffset] */
size_t input_chw_size = (size_t)in_c * inChOffset;
subgraph->input_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(input_chw_size, 1024));
/* Transpose weights from TFLite OHWI to kernel's expected HWIO layout */
assert(weights->resource);
signed char *weights_ohwi = malloc(weight_size);
signed char *weights_hwio = malloc(weight_size);
pipe_buffer_read(pcontext, weights->resource, 0, weight_size, weights_ohwi);
/* Transpose: OHWI -> HWIO */
for (unsigned oc = 0; oc < out_c; oc++) {
for (unsigned kh = 0; kh < k_h; kh++) {
for (unsigned kw = 0; kw < k_w; kw++) {
for (unsigned ic = 0; ic < k_in_c; ic++) {
unsigned ohwi_idx = oc * k_h * k_w * k_in_c + kh * k_w * k_in_c + kw * k_in_c + ic;
unsigned hwio_idx = kh * k_w * k_in_c * out_c + kw * k_in_c * out_c + ic * out_c + oc;
weights_hwio[hwio_idx] = weights_ohwi[ohwi_idx];
}
}
/* Output buffer: CHW layout [out_c][pitchC] */
size_t output_chw_size = (size_t)out_c * pitchC;
subgraph->output_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(output_chw_size, 1024));
/*
* Weights: TFLite provides uint8 weights with a per-tensor zero_point.
* MMALib expects signed int8 weights with symmetric quantization (zp=0).
*
* When the TFLite weight_zp != 0, naively subtracting weight_zp and
* clamping to [-128,127] is lossy (e.g. weight_zp=133 clamps values
* 0–4, which map to −133…−129, below −128). Instead, following TI's TIDL approach, we:
*
* 1. Dequantize weights to float: float_w = (uint8_w - weight_zp) * weight_scale
* 2. Re-quantize per-channel with symmetric int8 (zp=0):
* new_scale[c] = max(|float_w[c]|) / 127
* int8_w = round(float_w / new_scale[c]) — always fits in [-127,127]
* 3. Compute per-channel bias adjustment and scale/shift using
* new_scale[c] instead of the original weight_scale.
*
* This eliminates clamping entirely and gives exact results.
*/
int input_zp = input->zero_point; /* typically 128 for uint8 */
int weight_zp = weights->zero_point; /* typically non-zero for uint8 weights */
int output_zp = output->zero_point; /* typically 0 */
DBG("Zero points: input_zp=%d, weight_zp=%d, output_zp=%d\n",
input_zp, weight_zp, output_zp);
/* Read raw uint8 weights from TFLite resource */
uint8_t *raw_weights = malloc(weight_size);
pipe_buffer_read(pcontext, weights->resource, 0, weight_size, raw_weights);
unsigned kDim = k_h * k_w * k_in_c; /* elements per output filter */
/*
* Step 1: Dequantize weights to float.
* float_weight = (uint8_weight - weight_zp) * weight_scale
*/
float *float_weights = malloc(weight_size * sizeof(float));
float orig_weight_scale = weights->scale;
for (unsigned o = 0; o < out_c; o++) {
for (unsigned i = 0; i < kDim; i++) {
float_weights[o * kDim + i] =
((float)raw_weights[o * kDim + i] - (float)weight_zp) * orig_weight_scale;
}
}
pipe_buffer_write(pcontext, subgraph->coefs_rsrc, 0, weight_size, weights_hwio);
free(weights_ohwi);
free(weights_hwio);
free(raw_weights);
/* Copy bias data from TFLite model */
if (subgraph->bias_rsrc) {
pipe_buffer_copy(pcontext, subgraph->bias_rsrc, bias->resource, 0, 0, bias_size);
/* Debug: print first few bias values */
int32_t *bias_data = malloc(bias_size);
pipe_buffer_read(pcontext, subgraph->bias_rsrc, 0, bias_size, bias_data);
DBG("Bias values (int32): %d %d %d %d...\n", bias_data[0], bias_data[1], bias_data[2], bias_data[3]);
free(bias_data);
/*
* Step 2: Re-quantize per-channel with symmetric int8 (zp=0).
* For each output channel, find the max absolute float value,
* compute new_scale = max_abs / 127, and quantize.
* This guarantees all values fit in [-127, 127] with no clamping.
*/
int8_t *sym_weights = malloc(weight_size);
float *per_ch_weight_scale = malloc(out_c * sizeof(float));
int32_t *weight_sums = calloc(out_c, sizeof(int32_t));
for (unsigned o = 0; o < out_c; o++) {
/* Find max absolute value for this output channel */
float max_abs = 0.0f;
for (unsigned i = 0; i < kDim; i++) {
float absv = fabsf(float_weights[o * kDim + i]);
if (absv > max_abs)
max_abs = absv;
}
/* Compute per-channel symmetric scale */
if (max_abs > 0.0f) {
per_ch_weight_scale[o] = max_abs / 127.0f;
} else {
per_ch_weight_scale[o] = 1.0f; /* avoid division by zero */
}
/* Quantize to int8 — guaranteed no clamping needed */
int32_t sum = 0;
for (unsigned i = 0; i < kDim; i++) {
int val = (int)roundf(float_weights[o * kDim + i] / per_ch_weight_scale[o]);
/* Should always be in [-127, 127] but clamp defensively */
if (val > 127) val = 127;
if (val < -128) val = -128;
sym_weights[o * kDim + i] = (int8_t)val;
sum += sym_weights[o * kDim + i];
}
weight_sums[o] = sum;
}
free(float_weights);
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
DBG("Symmetric re-quantization: weight_zp=%d, orig_scale=%f\n",
weight_zp, orig_weight_scale);
DBG("Per-channel weight scales (first 4): %f %f %f %f\n",
per_ch_weight_scale[0], per_ch_weight_scale[1],
per_ch_weight_scale[2], per_ch_weight_scale[3]);
DBG("First 8 symmetric weights: %d %d %d %d %d %d %d %d\n",
sym_weights[0], sym_weights[1], sym_weights[2], sym_weights[3],
sym_weights[4], sym_weights[5], sym_weights[6], sym_weights[7]);
DBG("Weight sums (first 4 channels): %d %d %d %d\n",
weight_sums[0], weight_sums[1], weight_sums[2], weight_sums[3]);
}
/* Build params buffer: input/weight/bias/output IOVAs + conv_params + quant IOVAs */
uint64_t main_params[6] = {
thames_resource(subgraph->input_rsrc)->iova,
thames_resource(subgraph->coefs_rsrc)->iova,
subgraph->bias_rsrc ? thames_resource(subgraph->bias_rsrc)->iova : 0,
thames_resource(subgraph->output_rsrc)->iova,
0, /* Will be filled with conv_params IOVA below */
0 /* Will be filled with quant IOVA below */
};
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(weight_size, 1024));
pipe_buffer_write(pcontext, subgraph->coefs_rsrc, 0, weight_size, sym_weights);
free(sym_weights);
/* Create separate buffer for convolution parameters */
/*
* Bias: adjust to fold in input and output zero points.
*
* TI's TIDL formula (from tidl_import_quantize.cpp):
* nScale[o] = S_y / (S_x * S_w[o])
* finalBias[o] = originalBias[o] + (z_y * nScale[o] - z_x * Σ(weight_s8[o][i]))
*
* With per-channel symmetric re-quantization, S_w[o] = per_ch_weight_scale[o].
*/
subgraph->bias_rsrc = NULL;
if (bias && bias->resource) {
int32_t *bias_data = malloc(bias_size);
pipe_buffer_read(pcontext, bias->resource, 0, bias_size, bias_data);
for (unsigned o = 0; o < out_c; o++) {
/* nScale = S_y / (S_x * S_w[o]) — per-channel output-to-accumulator ratio */
double nScale = (double)output->scale /
((double)input->scale * (double)per_ch_weight_scale[o]);
double final_bias = (double)bias_data[o]
+ (output_zp * nScale)
- ((double)input_zp * weight_sums[o]);
double abs_bias = final_bias < 0 ? -final_bias : final_bias;
if (abs_bias > (double)2147483647) {
mesa_logw("Thames: bias overflow on channel %u, zeroing weights", o);
final_bias = final_bias / nScale;
final_bias = final_bias * (double)output->scale / (double)input->scale;
}
bias_data[o] = (int32_t)round(final_bias);
}
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
DBG("Adjusted bias values (int32): %d %d %d %d...\n",
bias_data[0], bias_data[1], bias_data[2], bias_data[3]);
}
subgraph->bias_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(bias_size, 1024));
pipe_buffer_write(pcontext, subgraph->bias_rsrc, 0, bias_size, bias_data);
free(bias_data);
}
free(weight_sums);
/* Store input_zp for padFillValue in the DSP kernel */
subgraph->conv_input_zp = input_zp;
subgraph->conv_output_zp = output_zp;
/*
* Quantization parameters for MMALib per-channel.
*
* MMALib uses per-channel uint8 scale and uint8 shift for output requantization:
* output[c] = clamp((acc[c] * scale[c] + rounding) >> shift[c], min, max)
*
* Per-channel scale ratio: scaleRatio[c] = S_y / (S_x * S_w[c])
* We find uint8 scale and uint8 shift per channel such that:
* scale / 2^shift ≈ scaleRatio[c]
*
* This follows TI's TIDL_getMMAv2_ScaleShiftAndError() algorithm.
*/
size_t quant_buf_size = 2 * out_c; /* [scale_0..scale_N-1][shift_0..shift_N-1] */
uint8_t *quant_data = calloc(1, quant_buf_size);
for (unsigned ch = 0; ch < out_c; ch++) {
double scale_ratio = (double)output->scale /
((double)input->scale * (double)per_ch_weight_scale[ch]);
/*
* Find best (scale, shift) pair using TI's brute-force approach:
* For each possible scale value (1..255), compute the optimal shift as
* shift = round(ln(scale / scaleRatio) / ln(2))
* then check the approximation error |scaleRatio - scale/2^shift|.
*/
uint8_t best_shift = 0;
uint8_t best_scale = 1;
double min_error = 1e30;
for (int s_iter = 1; s_iter <= 255; s_iter++) {
int shift_bits = (int)round(log((double)s_iter / scale_ratio) / log(2.0));
if (shift_bits > 40) shift_bits = 40;
if (shift_bits < 0) shift_bits = 0;
double approx = (double)s_iter / pow(2.0, shift_bits);
double err = fabs(scale_ratio - approx);
if (err < min_error) {
min_error = err;
best_shift = (uint8_t)shift_bits;
best_scale = (uint8_t)s_iter;
}
}
quant_data[ch] = best_scale;
quant_data[out_c + ch] = best_shift;
if (ch < 4 && DBG_ENABLED(THAMES_DBG_MSGS)) {
DBG("Channel %u: weight_scale=%f, scale_ratio=%f -> scale=%u, shift=%u (eff=%f, err=%e)\n",
ch, per_ch_weight_scale[ch], scale_ratio,
best_scale, best_shift, (double)best_scale / pow(2.0, best_shift), min_error);
}
}
free(per_ch_weight_scale);
subgraph->quant_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(quant_buf_size, 64));
pipe_buffer_write(pcontext, subgraph->quant_rsrc, 0, quant_buf_size, quant_data);
free(quant_data);
/*
* Convolution parameters buffer.
* Layout must match what the DSP kernel expects (see test_kernel.c header).
*/
uint32_t conv_params[15] = {
in_h, in_w, in_c,
k_h, k_w,
out_h, out_w, out_c,
stride_h, stride_w,
pad_h, pad_w,
(uint32_t)input->zero_point,
(uint32_t)weights->zero_point,
(uint32_t)output->zero_point
pad_top, pad_bottom, pad_left, pad_right,
(uint32_t)input_zp, /* input zero point for padFillValue */
};
/* Compute fixed-point multiplier and shift for requantization.
* Use TFLite's QuantizeMultiplier approach: shift clamped to [0, 31],
* multiplier adjusted to fit.
subgraph->conv_params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
sizeof(conv_params));
pipe_buffer_write(pcontext, subgraph->conv_params_rsrc, 0,
sizeof(conv_params), conv_params);
DBG("Conv params: in=%ux%ux%u k=%ux%u out=%ux%ux%u stride=%u,%u pad=%u/%u/%u/%u\n",
in_h, in_w, in_c, k_h, k_w, out_h, out_w, out_c,
stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right);
DBG("CHW layout: inChOffset=%u pitchC=%u input_chw_size=%zu output_chw_size=%zu\n",
inChOffset, pitchC, input_chw_size, output_chw_size);
/*
* Build main params buffer: 6 IOVAs that args[0..5] will point to.
*/
double effective_scale = (double)(input->scale * weights->scale) / (double)output->scale;
int exponent;
double significand = frexp(effective_scale, &exponent); /* significand in [0.5, 1.0) */
/* Compute target shift: 31 - exponent */
int target_shift = 31 - exponent;
uint32_t shift;
int32_t multiplier;
if (target_shift < 0) {
/* Scale >= 1.0, use shift=0 and scale down multiplier */
shift = 0;
multiplier = (int32_t)round(effective_scale * (1LL << 31));
} else if (target_shift > 31) {
/* Scale very small, clamp shift to 31 and scale down multiplier */
shift = 31;
/* multiplier = significand * 2^31 * 2^(31 - target_shift) */
double scaled_sig = significand * exp2(31 - target_shift);
multiplier = (int32_t)round(scaled_sig * (1LL << 31));
} else {
shift = target_shift;
multiplier = (int32_t)round(significand * (1LL << 31));
}
DBG("Quantization: effective_scale=%f, multiplier=%d (0x%x), shift=%u\n",
effective_scale, multiplier, multiplier, shift);
DBG("Input scale=%f, weight scale=%f, output scale=%f\n",
input->scale, weights->scale, output->scale);
DBG("Conv params being sent: input_zp=%u, weight_zp=%u, output_zp=%u\n",
(uint32_t)input->zero_point, (uint32_t)weights->zero_point, (uint32_t)output->zero_point);
DBG("Multiplier=%d (0x%x), shift=%u, output_zp=%u\n", multiplier, multiplier, shift, (uint32_t)output->zero_point);
/* Create buffer for quantization params (multiplier, shift) */
int32_t quant_params[2] = {
multiplier,
(int32_t)shift
uint64_t main_params[6] = {
thames_resource(subgraph->input_rsrc)->iova,
thames_resource(subgraph->coefs_rsrc)->iova,
subgraph->bias_rsrc ? thames_resource(subgraph->bias_rsrc)->iova : 0,
thames_resource(subgraph->output_rsrc)->iova,
thames_resource(subgraph->conv_params_rsrc)->iova,
thames_resource(subgraph->quant_rsrc)->iova,
};
struct pipe_resource *conv_params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(conv_params));
pipe_buffer_write(pcontext, conv_params_rsrc, 0, sizeof(conv_params), conv_params);
main_params[4] = thames_resource(conv_params_rsrc)->iova;
struct pipe_resource *quant_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(quant_params));
pipe_buffer_write(pcontext, quant_rsrc, 0, sizeof(quant_params), quant_params);
main_params[5] = thames_resource(quant_rsrc)->iova;
subgraph->params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(main_params));
pipe_buffer_write(pcontext, subgraph->params_rsrc, 0, sizeof(main_params), main_params);
pipe_resource_reference(&conv_params_rsrc, NULL);
pipe_resource_reference(&quant_rsrc, NULL);
subgraph->params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
sizeof(main_params));
pipe_buffer_write(pcontext, subgraph->params_rsrc, 0,
sizeof(main_params), main_params);
return &subgraph->base;
}
@ -441,21 +595,50 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
struct timespec start, end;
int ret;
unsigned in_h = subgraph->conv_in_h;
unsigned in_w = subgraph->conv_in_w;
unsigned in_c = subgraph->conv_in_c;
unsigned inChOffset = subgraph->conv_inChOffset;
/*
* Transpose input from NHWC to CHW for MMALib.
*
* Source (NHWC): [H][W][C] contiguous, size = H * W * C
* Destination (CHW): [C][inChOffset] where inChOffset = ALIGN(H*W, 64)
*
* Each channel plane has H*W valid pixels followed by padding zeros.
*/
size_t input_chw_size = (size_t)in_c * inChOffset;
uint8_t *chw_buf = calloc(1, input_chw_size); /* calloc zeros the padding */
for (unsigned i = 0; i < inputs_count; i++) {
struct thames_tensor *input = thames_find_tensor(subgraph, input_idxs[i]);
assert(input);
const uint8_t *nhwc = (const uint8_t *)inputs[i];
for (unsigned c = 0; c < in_c; c++) {
for (unsigned h = 0; h < in_h; h++) {
for (unsigned w = 0; w < in_w; w++) {
unsigned nhwc_idx = (h * in_w + w) * in_c + c;
unsigned chw_idx = c * inChOffset + h * in_w + w;
chw_buf[chw_idx] = nhwc[nhwc_idx];
}
}
}
struct thames_tensor *input_tensor = thames_find_tensor(subgraph, input_idxs[i]);
assert(input_tensor);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS))
thames_dump_buffer(inputs[i], "input", 0, 0, 0, input->size);
thames_dump_buffer(inputs[i], "input-nhwc", 0, 0, 0, in_h * in_w * in_c);
pipe_buffer_write(pcontext, subgraph->input_rsrc, input->offset, input->size, inputs[i]);
pipe_buffer_write(pcontext, subgraph->input_rsrc, 0, input_chw_size, chw_buf);
}
free(chw_buf);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->input_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->input_rsrc));
thames_dump_buffer(buf, "input-chw", 0, 0, 0, input_chw_size);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
@@ -465,13 +648,20 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
job.params = thames_resource(subgraph->params_rsrc)->handle;
job.params_size = pipe_buffer_size(subgraph->params_rsrc);
/* Pass both input buffers (input_a and input_b) as input BOs */
uint32_t in_bo_handles[2] = {
thames_resource(subgraph->input_rsrc)->handle,
thames_resource(subgraph->coefs_rsrc)->handle,
};
/*
* Pass all input BOs the kernel will access.
* The DRM driver needs to know about them for cache management.
*/
uint32_t in_bo_handles[4];
unsigned in_bo_count = 0;
in_bo_handles[in_bo_count++] = thames_resource(subgraph->input_rsrc)->handle;
in_bo_handles[in_bo_count++] = thames_resource(subgraph->coefs_rsrc)->handle;
if (subgraph->bias_rsrc)
in_bo_handles[in_bo_count++] = thames_resource(subgraph->bias_rsrc)->handle;
in_bo_handles[in_bo_count++] = thames_resource(subgraph->quant_rsrc)->handle;
job.in_bo_handles = (uintptr_t)in_bo_handles;
job.in_bo_handle_count = 2;
job.in_bo_handle_count = in_bo_count;
job.out_bo_handles = (uintptr_t)&thames_resource(subgraph->output_rsrc)->handle;
job.out_bo_handle_count = 1;
@@ -499,11 +689,12 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
DBG("Input hex: %s\n", hexbuf);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
uint8_t *coefsbuf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, PIPE_MAP_READ, &transfer_in);
struct pipe_transfer *transfer_coefs;
uint8_t *coefsbuf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, PIPE_MAP_READ, &transfer_coefs);
for (int i = 0; i < 32; i++)
snprintf(hexbuf + i * 3, 4, "%02x ", coefsbuf[i]);
DBG("Coefs hex: %s\n", hexbuf);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
pipe_buffer_unmap(subgraph->base.context, transfer_coefs);
/* Read output buffer */
struct pipe_transfer *transfer_out;
@@ -531,21 +722,44 @@ thames_ml_subgraph_read_outputs(struct pipe_context *pcontext,
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
uint8_t **outputs = (uint8_t **)outputsv;
for (int i = 0; i < outputs_count; i++) {
struct thames_tensor *output = thames_find_tensor(subgraph, output_idxs[i]);
unsigned out_h = subgraph->conv_out_h;
unsigned out_w = subgraph->conv_out_w;
unsigned out_c = subgraph->conv_out_c;
unsigned pitchC = subgraph->conv_pitchC;
size_t output_chw_size = (size_t)out_c * pitchC;
for (int i = 0; i < outputs_count; i++) {
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->output_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->output_rsrc));
thames_dump_buffer(buf, "output-chw", 0, 0, 0, output_chw_size);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
/* For test kernel phase: output is at offset 0, size is the buffer size */
unsigned size = pipe_buffer_size(subgraph->output_rsrc);
DBG("Reading output %u from offset 0, size %u\n", output_idxs[i], size);
pipe_buffer_read(pcontext, subgraph->output_rsrc, 0, size, outputs[i]);
/*
* Read CHW output from DSP and transpose to NHWC for TFLite.
*
* Source (CHW): [out_c][pitchC] where each channel has out_h*out_w valid pixels
* Destination (NHWC): [out_h][out_w][out_c] contiguous
*/
uint8_t *chw_out = malloc(output_chw_size);
pipe_buffer_read(pcontext, subgraph->output_rsrc, 0, output_chw_size, chw_out);
for (unsigned c = 0; c < out_c; c++) {
for (unsigned h = 0; h < out_h; h++) {
for (unsigned w = 0; w < out_w; w++) {
unsigned chw_idx = c * pitchC + h * out_w + w;
unsigned nhwc_idx = (h * out_w + w) * out_c + c;
outputs[i][nhwc_idx] = chw_out[chw_idx];
}
}
}
free(chw_out);
DBG("Reading output %u: CHW %ux%u -> NHWC %ux%ux%u (%u bytes)\n",
output_idxs[i], out_c, pitchC, out_h, out_w, out_c, out_h * out_w * out_c);
}
}
@@ -561,6 +775,8 @@ thames_ml_subgraph_destroy(struct pipe_context *pcontext,
pipe_resource_reference(&subgraph->bias_rsrc, NULL);
pipe_resource_reference(&subgraph->kernel_rsrc, NULL);
pipe_resource_reference(&subgraph->params_rsrc, NULL);
pipe_resource_reference(&subgraph->quant_rsrc, NULL);
pipe_resource_reference(&subgraph->conv_params_rsrc, NULL);
util_dynarray_fini(&subgraph->operations);
util_dynarray_fini(&subgraph->tensors);

View file

@@ -186,8 +186,18 @@ struct thames_subgraph {
uint8_t *coefs;
struct pipe_resource *coefs_rsrc;
unsigned coefs_used;
struct pipe_resource *bias_rsrc; /* Bias tensor (int32) */
struct pipe_resource *quant_rsrc; /* Scale + shift (uint8 packed) */
struct pipe_resource *conv_params_rsrc; /* Convolution parameters */
/* Convolution dimensions (needed for NHWC<->CHW transpose at invoke/read) */
unsigned conv_in_h, conv_in_w, conv_in_c;
unsigned conv_out_h, conv_out_w, conv_out_c;
unsigned conv_inChOffset; /* ALIGN(in_h*in_w + 64, 64) */
unsigned conv_pitchC; /* ALIGN(out_h*out_w + 64, 64) */
int conv_input_zp; /* Input zero point (for padFillValue) */
int conv_output_zp; /* Output zero point (post-add) */
};
bool