mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-12 13:20:33 +01:00
Use MMALib
This commit is contained in:
parent
b8319435ef
commit
cbc347c3ee
5 changed files with 29426 additions and 329 deletions
|
|
@ -27,6 +27,10 @@ scp "${KERNEL_SRC}" "${BOARD_USER}@${BOARD_IP}:/tmp/test_kernel.c"
|
|||
# Create linker command file on board to ensure proper section placement
|
||||
echo "[1.5/5] Creating linker.cmd on board..."
|
||||
ssh "${BOARD_USER}@${BOARD_IP}" "cat > /tmp/linker.cmd <<EOF
|
||||
MEMORY
|
||||
{
|
||||
L2SRAM (RWX) : origin = 0x0, length = 0x200000
|
||||
}
|
||||
SECTIONS
|
||||
{
|
||||
.text : {
|
||||
|
|
@ -34,7 +38,10 @@ SECTIONS
|
|||
*(.text)
|
||||
*(.const)
|
||||
*(.switch)
|
||||
}
|
||||
*(.kernel_data)
|
||||
*(.data)
|
||||
*(.bss)
|
||||
} > L2SRAM
|
||||
}
|
||||
EOF"
|
||||
|
||||
|
|
@ -50,6 +57,8 @@ ssh "${BOARD_USER}@${BOARD_IP}" "cd /tmp && ${CL7X_PATH} -mv7524 --abi=eabi -O0
|
|||
--reread_libs --warn_sections \
|
||||
--rom_model \
|
||||
-l${MMALIB_LIB} \
|
||||
-l/home/tomeu/src/ti-processor-sdk-rtos-j722s-evm-09_02_00_05/mmalib_09_02_00_08/lib/C7524/release/mmalib_cn_C7524.lib \
|
||||
-l/home/tomeu/src/ti-processor-sdk-rtos-j722s-evm-09_02_00_05/mmalib_09_02_00_08/lib/C7524/release/common_C7524.lib \
|
||||
-lrts7524_le.lib \
|
||||
linker.cmd \
|
||||
-o test_kernel.out"
|
||||
|
|
|
|||
|
|
@ -4,11 +4,20 @@
|
|||
* This kernel uses TI's MMALib to perform quantized int8 convolution
|
||||
* accelerated by the C7x Matrix Multiply Accelerator (MMA) hardware.
|
||||
*
|
||||
* Data layout contract with Mesa (CPU) side:
|
||||
* - Input arrives in CHW layout (transposed from NHWC by Mesa)
|
||||
* Layout: [numInChannels][inChOffset] where inChOffset = ALIGN(H*W + 64, 64)
|
||||
* - Weights arrive in OHWI layout (TFLite native, no transpose needed)
|
||||
* Layout: [numOutChannels][kH * kW * numInChannels]
|
||||
* - Bias: [numOutChannels] int32 values
|
||||
* - Output produced in CHW layout (Mesa transposes back to NHWC)
|
||||
* Layout: [numOutChannels][pitchC] where pitchC = ALIGN(outH*outW + 64, 64)
|
||||
*
|
||||
* Args:
|
||||
* args[0] = input tensor (uint8, NHWC layout)
|
||||
* args[1] = weight tensor (uint8, HWIO layout)
|
||||
* args[0] = input tensor (uint8, CHW layout, padded to inChOffset per channel)
|
||||
* args[1] = weight tensor (int8, OHWI layout = [Cout][Fr*Fc*Cin])
|
||||
* args[2] = bias tensor (int32, per output channel)
|
||||
* args[3] = output tensor (uint8, NHWC layout)
|
||||
* args[3] = output tensor (uint8, CHW layout, pitchC per channel)
|
||||
* args[4] = params struct pointer (uint32):
|
||||
* params[0] = input_height
|
||||
* params[1] = input_width
|
||||
|
|
@ -20,14 +29,14 @@
|
|||
* params[7] = output_channels
|
||||
* params[8] = stride_h
|
||||
* params[9] = stride_w
|
||||
* params[10] = pad_h
|
||||
* params[11] = pad_w
|
||||
* params[12] = input_zero_point (unused - MMALib handles this internally)
|
||||
* params[13] = weight_zero_point (unused - MMALib handles this internally)
|
||||
* params[14] = output_zero_point (unused - MMALib handles this internally)
|
||||
* args[5] = quantization params (int32):
|
||||
* quant[0] = multiplier (int32)
|
||||
* quant[1] = shift (int32)
|
||||
* params[10] = pad_top
|
||||
* params[11] = pad_bottom
|
||||
* params[12] = pad_left
|
||||
* params[13] = pad_right
|
||||
* params[14] = input_zero_point (for padFillValue)
|
||||
* args[5] = quantization params (uint8 packed):
|
||||
* quant[0..out_c-1] = scale values (uint8)
|
||||
* quant[out_c..2*out_c-1] = shift values (uint8)
|
||||
*
|
||||
* Compile with:
|
||||
* See compile-kernel.sh for build procedure
|
||||
|
|
@ -38,68 +47,281 @@
|
|||
/* MMALib headers - MMA hardware acceleration library */
|
||||
#include "mmalib.h"
|
||||
|
||||
/* Align a value up to the next multiple of 'align' (align must be power of 2) */
|
||||
#define ALIGN_UP(val, align) (((val) + (align) - 1) & ~((align) - 1))
|
||||
|
||||
/*
|
||||
* Quantized int8 convolution kernel using C7x MMA hardware via MMALib
|
||||
* Quantized convolution kernel using C7x MMA hardware via MMALib
|
||||
*
|
||||
* This is the entry point called from the DRM driver.
|
||||
* Note: This code runs bare-metal on the DSP, so no stdlib functions.
|
||||
* Placed in .text.entry section to ensure it's at the beginning of the binary.
|
||||
*/
|
||||
|
||||
/* Static buffers for handle and packed weights */
|
||||
/* Placed in .kernel_data section so they are uploaded with the kernel code */
|
||||
#pragma DATA_ALIGN(handle_buffer, 128)
|
||||
__attribute__((section(".kernel_data")))
|
||||
static uint8_t handle_buffer[16384];
|
||||
|
||||
/* Weights padded to pitchA stride between output channels */
|
||||
#pragma DATA_ALIGN(weights_padded, 128)
|
||||
__attribute__((section(".kernel_data")))
|
||||
static int8_t weights_padded[65536];
|
||||
|
||||
/* Weights after MMALib reorder for MMA-friendly layout */
|
||||
#pragma DATA_ALIGN(weights_reordered, 128)
|
||||
__attribute__((section(".kernel_data")))
|
||||
static int8_t weights_reordered[65536];
|
||||
|
||||
/* Per-channel scale (uint8) and shift (uint8) for requantization */
|
||||
#pragma DATA_ALIGN(scale_buf, 128)
|
||||
__attribute__((section(".kernel_data")))
|
||||
static uint8_t scale_buf[2048];
|
||||
|
||||
#pragma DATA_ALIGN(shift_buf, 128)
|
||||
__attribute__((section(".kernel_data")))
|
||||
static uint8_t shift_buf[2048];
|
||||
|
||||
|
||||
#pragma RETAIN(test_kernel)
|
||||
__attribute__((section(".text.entry")))
|
||||
int
|
||||
test_kernel(unsigned long long *args)
|
||||
{
|
||||
unsigned char *input = (unsigned char *)args[0];
|
||||
unsigned char *weights = (unsigned char *)args[1];
|
||||
int *bias = (int *)args[2];
|
||||
unsigned char *output = (unsigned char *)args[3];
|
||||
unsigned int *params = (unsigned int *)args[4];
|
||||
int *quant = (int *)args[5];
|
||||
unsigned char *input = (unsigned char *)args[0];
|
||||
signed char *weights = (signed char *)args[1];
|
||||
int *bias = (int *)args[2];
|
||||
unsigned char *output = (unsigned char *)args[3];
|
||||
unsigned int *params = (unsigned int *)args[4];
|
||||
unsigned char *quant = (unsigned char *)args[5];
|
||||
|
||||
/* Extract convolution parameters */
|
||||
unsigned int in_h = params[0];
|
||||
unsigned int in_w = params[1];
|
||||
unsigned int in_c = params[2];
|
||||
unsigned int k_h = params[3];
|
||||
unsigned int k_w = params[4];
|
||||
unsigned int out_h = params[5];
|
||||
unsigned int out_w = params[6];
|
||||
unsigned int out_c = params[7];
|
||||
unsigned int stride_h = params[8];
|
||||
unsigned int stride_w = params[9];
|
||||
int pad_h = (int)params[10];
|
||||
int pad_w = (int)params[11];
|
||||
|
||||
/* Extract quantization parameters */
|
||||
int multiplier = quant[0];
|
||||
int shift = quant[1];
|
||||
|
||||
/* Basic validation */
|
||||
if (!input || !weights || !bias || !output || !params || !quant) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (in_w == 0 || in_h == 0 || in_c == 0 ||
|
||||
k_w == 0 || k_h == 0 || out_c == 0 ||
|
||||
stride_w == 0 || stride_h == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* TODO: Call MMALib convolution function
|
||||
*
|
||||
* This requires:
|
||||
* 1. Allocating a kernel handle
|
||||
* 2. Setting up MMALIB_bufParams structures
|
||||
* 3. Calling MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init()
|
||||
* 4. Calling MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_exec()
|
||||
*
|
||||
* For now, return success to test static linking of mmalib_C7524.lib
|
||||
*/
|
||||
|
||||
return 0; /* Success */
|
||||
/* Extract convolution parameters */
|
||||
unsigned int in_h = params[0];
|
||||
unsigned int in_w = params[1];
|
||||
unsigned int in_c = params[2];
|
||||
unsigned int k_h = params[3];
|
||||
unsigned int k_w = params[4];
|
||||
unsigned int out_h = params[5];
|
||||
unsigned int out_w = params[6];
|
||||
unsigned int out_c = params[7];
|
||||
unsigned int stride_h = params[8];
|
||||
unsigned int stride_w = params[9];
|
||||
int pad_top = (int)params[10];
|
||||
int pad_bottom = (int)params[11];
|
||||
int pad_left = (int)params[12];
|
||||
int pad_right = (int)params[13];
|
||||
int input_zp = (int)params[14];
|
||||
|
||||
/* Basic validation */
|
||||
if (!input || !weights || !bias || !output || !params || !quant)
|
||||
return -1;
|
||||
|
||||
if (in_w == 0 || in_h == 0 || in_c == 0 ||
|
||||
k_w == 0 || k_h == 0 || out_c == 0 ||
|
||||
stride_w == 0 || stride_h == 0)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* Derived parameters for MMALib (following test case 1005 pattern)
|
||||
*
|
||||
* MMALib convolveBias_row operates on channel-first data:
|
||||
* A matrix (src0) = weights: [numOutChannels][kDim] with stride pitchA
|
||||
* B matrix (src1) = input: [numInChannels][inChOffset]
|
||||
* C matrix (dst) = output: [numOutChannels][pitchC]
|
||||
*/
|
||||
int kDim = k_h * k_w * in_c; /* elements per filter */
|
||||
int pitchA = ALIGN_UP(kDim, 64); /* 64-byte aligned weight row stride */
|
||||
int spatial = in_w * in_h; /* total spatial locations per channel */
|
||||
int inChOffset = ALIGN_UP(spatial + 64, 64); /* channel stride with extra padding (test 1005 pattern) */
|
||||
int outSpatial = out_w * out_h;
|
||||
int pitchC = ALIGN_UP(outSpatial + 64, 64); /* output channel stride with extra padding */
|
||||
|
||||
/*
|
||||
* Copy per-channel scale and shift from the quant buffer.
|
||||
* Mesa packs them as: [scale0..scaleN-1][shift0..shiftN-1], both uint8.
|
||||
*/
|
||||
if (out_c > 2048)
|
||||
return -2;
|
||||
for (unsigned int i = 0; i < out_c; i++) {
|
||||
scale_buf[i] = quant[i];
|
||||
shift_buf[i] = quant[out_c + i];
|
||||
}
|
||||
|
||||
/*
|
||||
* Pad weights from OHWI [out_c][kDim] contiguous to [out_c][pitchA] with
|
||||
* zero-padding between rows for 64-byte alignment.
|
||||
*/
|
||||
int weight_padded_size = out_c * pitchA;
|
||||
if (weight_padded_size > (int)sizeof(weights_padded))
|
||||
return -2;
|
||||
|
||||
for (unsigned int o = 0; o < out_c; o++) {
|
||||
for (int j = 0; j < kDim; j++)
|
||||
weights_padded[o * pitchA + j] = weights[o * kDim + j];
|
||||
for (int j = kDim; j < pitchA; j++)
|
||||
weights_padded[o * pitchA + j] = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Weight reorder: rearrange from natural OHWI to MMA-friendly tiled layout.
|
||||
* Must happen before init.
|
||||
*/
|
||||
MMALIB_CNN_convolveBias_row_processWeights_Args reorderArgs;
|
||||
reorderArgs.funcStyle = MMALIB_FUNCTION_OPTIMIZED;
|
||||
reorderArgs.data_type = MMALIB_UINT8;
|
||||
reorderArgs.Fr = k_h;
|
||||
reorderArgs.Fc = k_w;
|
||||
reorderArgs.pitchA = pitchA;
|
||||
reorderArgs.numInChPerGroup = in_c;
|
||||
reorderArgs.subMChannels = out_c;
|
||||
reorderArgs.No = out_c;
|
||||
reorderArgs.numGroupsPerKernel = 1;
|
||||
reorderArgs.packetizeMode = 1;
|
||||
|
||||
int32_t reorderSize = MMALIB_CNN_convolveBias_row_processWeights_getMemorySize(
|
||||
&reorderArgs, weights_padded);
|
||||
if (reorderSize > (int)sizeof(weights_reordered))
|
||||
return -4;
|
||||
|
||||
MMALIB_STATUS status = MMALIB_CNN_convolveBias_row_processWeights_reorder(
|
||||
&reorderArgs, weights_padded, weights_reordered);
|
||||
if (status != MMALIB_SUCCESS)
|
||||
return 3000 + (int)status;
|
||||
|
||||
/* ---- MMALib structures ---- */
|
||||
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_InitArgs initArgs;
|
||||
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_ExecInArgs execInArgs;
|
||||
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_ExecOutArgs execOutArgs;
|
||||
|
||||
MMALIB_bufParams2D_t src0_addr; /* Weights (A matrix) */
|
||||
MMALIB_bufParams2D_t src1_addr; /* Input feature map (B matrix) */
|
||||
MMALIB_bufParams2D_t src2_addr; /* Bias */
|
||||
MMALIB_bufParams1D_t src3_addr; /* Scale */
|
||||
MMALIB_bufParams3D_t dst_addr; /* Output feature map (C matrix) */
|
||||
|
||||
/*
|
||||
* InitArgs -- following MMALib test case 1005 conventions:
|
||||
* Fc = kernel width (spatial), NOT multiplied by in_c
|
||||
* Fr = kernel height
|
||||
* strideX/Y = spatial stride, NOT multiplied by in_c
|
||||
* inWidth = spatial width of input feature map
|
||||
* maxHeight = spatial height of input feature map
|
||||
* inChOffset = pitch between input channel planes (>= spatial, 64-aligned)
|
||||
* validColsIn = total valid spatial locations to process
|
||||
*/
|
||||
initArgs.funcStyle = MMALIB_FUNCTION_OPTIMIZED;
|
||||
initArgs.Fc = k_w;
|
||||
initArgs.Fr = k_h;
|
||||
initArgs.strideX = stride_w;
|
||||
initArgs.strideY = stride_h;
|
||||
initArgs.dilationX = 1;
|
||||
initArgs.dilationY = 1;
|
||||
initArgs.inWidth = in_w;
|
||||
initArgs.maxHeight = in_h;
|
||||
initArgs.inChOffset = inChOffset;
|
||||
initArgs.validColsIn = spatial; /* process all spatial locations */
|
||||
initArgs.validColsPerRowIn = 0; /* 0 for LINEAR stride-1 */
|
||||
initArgs.validRowsIn = 0; /* 0 for LINEAR stride-1 */
|
||||
initArgs.inputPitchPerRow = 0; /* 0 for LINEAR stride-1 */
|
||||
initArgs.outputPitchPerRow = 0; /* 0 for LINEAR stride-1 */
|
||||
initArgs.No = out_c;
|
||||
initArgs.subMChannels = out_c; /* process all output channels at once */
|
||||
initArgs.numGroupsPerKernel = 1;
|
||||
initArgs.bias = 1; /* bias enabled */
|
||||
initArgs.activationType = MMALIB_RELU; /* ReLU clamps negative to 0 */
|
||||
initArgs.pSatMin = 0;
|
||||
initArgs.pSatMax = 0; /* 0 = use default saturation for data type */
|
||||
initArgs.mode = MMALIB_LINEAR;
|
||||
initArgs.col = 0;
|
||||
initArgs.pad = 0;
|
||||
initArgs.padTop = pad_top;
|
||||
initArgs.padBottom = pad_bottom;
|
||||
initArgs.padLeft = pad_left;
|
||||
initArgs.padRight = pad_right;
|
||||
initArgs.validColsOutBottom = 0;
|
||||
initArgs.packetizeMode = 1; /* weights are reordered/packetized */
|
||||
|
||||
/*
|
||||
* Buffer parameter structures
|
||||
*
|
||||
* src0 (weights): 2D [numOutChannels][kDim] with pitchA stride
|
||||
* src1 (input): 2D [numInChannels][inChOffset]
|
||||
* src2 (bias): 2D [1][numOutChannels] (int32 per output channel)
|
||||
* src3 (scale): 1D [numOutChannels] (uint8 per channel)
|
||||
* dst (output): 3D [numGroupsPerKernel][numOutChannels][pitchC]
|
||||
*/
|
||||
src0_addr.data_type = MMALIB_INT8;
|
||||
src0_addr.dim_x = kDim;
|
||||
src0_addr.dim_y = out_c;
|
||||
src0_addr.stride_y = pitchA;
|
||||
|
||||
src1_addr.data_type = MMALIB_UINT8;
|
||||
src1_addr.dim_x = inChOffset;
|
||||
src1_addr.dim_y = in_c;
|
||||
src1_addr.stride_y = inChOffset; /* * sizeof(uint8) = inChOffset bytes */
|
||||
|
||||
src2_addr.data_type = MMALIB_INT32;
|
||||
src2_addr.dim_x = out_c;
|
||||
src2_addr.dim_y = 1;
|
||||
src2_addr.stride_y = out_c; /* in elements, not bytes — MMALib uses data_type for byte stride */
|
||||
|
||||
src3_addr.data_type = MMALIB_UINT8;
|
||||
src3_addr.dim_x = out_c;
|
||||
|
||||
dst_addr.data_type = MMALIB_UINT8;
|
||||
dst_addr.dim_x = pitchC;
|
||||
dst_addr.dim_y = out_c;
|
||||
dst_addr.stride_y = pitchC; /* bytes (uint8) */
|
||||
dst_addr.dim_z = 1;
|
||||
dst_addr.stride_z = out_c * pitchC;
|
||||
|
||||
/* Get Handle Size */
|
||||
int32_t handleSize = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_getHandleSize(&initArgs);
|
||||
if (handleSize > (int)sizeof(handle_buffer))
|
||||
return -3;
|
||||
|
||||
/* Check params before init */
|
||||
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init_checkParams(
|
||||
handle_buffer,
|
||||
&src0_addr, &src1_addr, &src2_addr, &src3_addr, &dst_addr,
|
||||
&initArgs);
|
||||
if (status != MMALIB_SUCCESS)
|
||||
return 4000 + (int)status;
|
||||
|
||||
/* Init kernel handle */
|
||||
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init(
|
||||
handle_buffer,
|
||||
&src0_addr, &src1_addr, &src2_addr, &src3_addr, &dst_addr,
|
||||
&initArgs);
|
||||
if (status != MMALIB_SUCCESS)
|
||||
return 1000 + (int)status;
|
||||
|
||||
/* Fill ExecInArgs */
|
||||
execInArgs.validColsIn = spatial;
|
||||
execInArgs.validColsPerRowIn = 0;
|
||||
execInArgs.validRowsIn = 0;
|
||||
execInArgs.col = 0;
|
||||
execInArgs.subMChannels = out_c;
|
||||
execInArgs.quantMethod = 1; /* MMALIB_QM_PER_CHANNEL */
|
||||
execInArgs.padFillValue = input_zp; /* padding fill = input zero point */
|
||||
execInArgs.enableDynamicRange = 0;
|
||||
execInArgs.initDynamicRange = 0;
|
||||
|
||||
/* Execute convolution */
|
||||
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_exec(
|
||||
handle_buffer,
|
||||
weights_reordered,
|
||||
input,
|
||||
bias,
|
||||
scale_buf,
|
||||
shift_buf,
|
||||
output,
|
||||
&execInArgs,
|
||||
&execOutArgs);
|
||||
if (status != MMALIB_SUCCESS)
|
||||
return 2000 + (int)status;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Dummy main to satisfy linker/RTS requirements (we don't run _c_int00) */
|
||||
int main(void) { return 0; }
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -265,165 +265,319 @@ thames_ml_subgraph_create(struct pipe_context *pcontext,
|
|||
input->zero_point, weights->zero_point, bias ? bias->zero_point : 0, output->zero_point);
|
||||
|
||||
/* Input dimensions: NHWC */
|
||||
unsigned int in_h = input->dims[1];
|
||||
unsigned int in_w = input->dims[2];
|
||||
unsigned int in_c = input->dims[3];
|
||||
unsigned in_h = input->dims[1];
|
||||
unsigned in_w = input->dims[2];
|
||||
unsigned in_c = input->dims[3];
|
||||
|
||||
/* Weight dimensions: TFLite uses OHWI format (output, height, width, input) */
|
||||
unsigned int out_c = weights->dims[0];
|
||||
unsigned int k_h = weights->dims[1];
|
||||
unsigned int k_w = weights->dims[2];
|
||||
unsigned int k_in_c = weights->dims[3];
|
||||
unsigned out_c = weights->dims[0];
|
||||
unsigned k_h = weights->dims[1];
|
||||
unsigned k_w = weights->dims[2];
|
||||
unsigned k_in_c = weights->dims[3];
|
||||
|
||||
/* Output dimensions: NHWC */
|
||||
unsigned int out_h = output->dims[1];
|
||||
unsigned int out_w = output->dims[2];
|
||||
unsigned out_h = output->dims[1];
|
||||
unsigned out_w = output->dims[2];
|
||||
|
||||
unsigned int stride_h = conv_op->conv.stride_y;
|
||||
unsigned int stride_w = conv_op->conv.stride_x;
|
||||
unsigned stride_h = conv_op->conv.stride_y;
|
||||
unsigned stride_w = conv_op->conv.stride_x;
|
||||
|
||||
/* Calculate padding from input/output/kernel dimensions */
|
||||
unsigned int pad_h, pad_w;
|
||||
unsigned pad_top, pad_bottom, pad_left, pad_right;
|
||||
if (conv_op->conv.padding_same) {
|
||||
pad_h = ((out_h - 1) * stride_h + k_h - in_h) / 2;
|
||||
pad_w = ((out_w - 1) * stride_w + k_w - in_w) / 2;
|
||||
unsigned pad_h = ((out_h - 1) * stride_h + k_h - in_h);
|
||||
unsigned pad_w = ((out_w - 1) * stride_w + k_w - in_w);
|
||||
pad_top = pad_h / 2;
|
||||
pad_bottom = pad_h - pad_top;
|
||||
pad_left = pad_w / 2;
|
||||
pad_right = pad_w - pad_left;
|
||||
} else {
|
||||
pad_h = 0;
|
||||
pad_w = 0;
|
||||
pad_top = pad_bottom = pad_left = pad_right = 0;
|
||||
}
|
||||
|
||||
DBG("Conv: %ux%ux%u input, %ux%ux%ux%u kernel (k_in_c=%u), stride=%u,%u, pad=%u,%u -> %ux%ux%u output\n",
|
||||
in_h, in_w, in_c, k_h, k_w, k_in_c, out_c, k_in_c, stride_h, stride_w, pad_h, pad_w, out_h, out_w, out_c);
|
||||
DBG("Conv: %ux%ux%u input, %ux%ux%ux%u kernel (k_in_c=%u), stride=%u,%u, "
|
||||
"pad=%u/%u/%u/%u -> %ux%ux%u output\n",
|
||||
in_h, in_w, in_c, k_h, k_w, k_in_c, out_c, k_in_c,
|
||||
stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right,
|
||||
out_h, out_w, out_c);
|
||||
|
||||
/*
|
||||
* Store convolution dimensions in subgraph for use during invoke/read.
|
||||
* MMALib expects CHW layout, so we compute aligned pitches.
|
||||
*/
|
||||
unsigned spatial = in_w * in_h;
|
||||
unsigned inChOffset = ALIGN_POT(spatial + 64, 64); /* extra padding (test 1005 pattern) */
|
||||
unsigned outSpatial = out_w * out_h;
|
||||
unsigned pitchC = ALIGN_POT(outSpatial + 64, 64); /* extra padding (test 1005 pattern) */
|
||||
|
||||
subgraph->conv_in_h = in_h;
|
||||
subgraph->conv_in_w = in_w;
|
||||
subgraph->conv_in_c = in_c;
|
||||
subgraph->conv_out_h = out_h;
|
||||
subgraph->conv_out_w = out_w;
|
||||
subgraph->conv_out_c = out_c;
|
||||
subgraph->conv_inChOffset = inChOffset;
|
||||
subgraph->conv_pitchC = pitchC;
|
||||
|
||||
size_t input_size = in_h * in_w * in_c;
|
||||
size_t weight_size = k_h * k_w * k_in_c * out_c;
|
||||
size_t bias_size = out_c * sizeof(int32_t);
|
||||
size_t output_size = out_h * out_w * out_c;
|
||||
|
||||
/* Allocate buffers - reuse coefs_rsrc for weights, create new one for bias if needed */
|
||||
subgraph->input_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(input_size, 1024));
|
||||
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(weight_size, 1024));
|
||||
subgraph->output_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(output_size, 1024));
|
||||
|
||||
/* Create bias buffer */
|
||||
subgraph->bias_rsrc = NULL;
|
||||
if (bias && bias->resource) {
|
||||
subgraph->bias_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(bias_size, 1024));
|
||||
}
|
||||
/* Input buffer: CHW layout [in_c][inChOffset] */
|
||||
size_t input_chw_size = (size_t)in_c * inChOffset;
|
||||
subgraph->input_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
|
||||
MAX2(input_chw_size, 1024));
|
||||
|
||||
/* Transpose weights from TFLite OHWI to kernel's expected HWIO layout */
|
||||
assert(weights->resource);
|
||||
signed char *weights_ohwi = malloc(weight_size);
|
||||
signed char *weights_hwio = malloc(weight_size);
|
||||
|
||||
pipe_buffer_read(pcontext, weights->resource, 0, weight_size, weights_ohwi);
|
||||
|
||||
/* Transpose: OHWI -> HWIO */
|
||||
for (unsigned oc = 0; oc < out_c; oc++) {
|
||||
for (unsigned kh = 0; kh < k_h; kh++) {
|
||||
for (unsigned kw = 0; kw < k_w; kw++) {
|
||||
for (unsigned ic = 0; ic < k_in_c; ic++) {
|
||||
unsigned ohwi_idx = oc * k_h * k_w * k_in_c + kh * k_w * k_in_c + kw * k_in_c + ic;
|
||||
unsigned hwio_idx = kh * k_w * k_in_c * out_c + kw * k_in_c * out_c + ic * out_c + oc;
|
||||
weights_hwio[hwio_idx] = weights_ohwi[ohwi_idx];
|
||||
}
|
||||
}
|
||||
/* Output buffer: CHW layout [out_c][pitchC] */
|
||||
size_t output_chw_size = (size_t)out_c * pitchC;
|
||||
subgraph->output_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
|
||||
MAX2(output_chw_size, 1024));
|
||||
|
||||
/*
|
||||
* Weights: TFLite provides uint8 weights with a per-tensor zero_point.
|
||||
* MMALib expects signed int8 weights with symmetric quantization (zp=0).
|
||||
*
|
||||
* When the TFLite weight_zp != 0, naively subtracting weight_zp and
|
||||
* clamping to [-128,127] is lossy (e.g. weight_zp=133 clamps values
|
||||
* 0–4). Instead, following TI's TIDL approach, we:
|
||||
*
|
||||
* 1. Dequantize weights to float: float_w = (uint8_w - weight_zp) * weight_scale
|
||||
* 2. Re-quantize per-channel with symmetric int8 (zp=0):
|
||||
* new_scale[c] = max(|float_w[c]|) / 127
|
||||
* int8_w = round(float_w / new_scale[c]) — always fits in [-127,127]
|
||||
* 3. Compute per-channel bias adjustment and scale/shift using
|
||||
* new_scale[c] instead of the original weight_scale.
|
||||
*
|
||||
* This avoids clamping; the only error left in the weights is the rounding in step 2.
|
||||
*/
|
||||
int input_zp = input->zero_point; /* typically 128 for uint8 */
|
||||
int weight_zp = weights->zero_point; /* typically non-zero for uint8 weights */
|
||||
int output_zp = output->zero_point; /* typically 0 */
|
||||
|
||||
DBG("Zero points: input_zp=%d, weight_zp=%d, output_zp=%d\n",
|
||||
input_zp, weight_zp, output_zp);
|
||||
|
||||
/* Read raw uint8 weights from TFLite resource */
|
||||
uint8_t *raw_weights = malloc(weight_size);
|
||||
pipe_buffer_read(pcontext, weights->resource, 0, weight_size, raw_weights);
|
||||
|
||||
unsigned kDim = k_h * k_w * k_in_c; /* elements per output filter */
|
||||
|
||||
/*
|
||||
* Step 1: Dequantize weights to float.
|
||||
* float_weight = (uint8_weight - weight_zp) * weight_scale
|
||||
*/
|
||||
float *float_weights = malloc(weight_size * sizeof(float));
|
||||
float orig_weight_scale = weights->scale;
|
||||
|
||||
for (unsigned o = 0; o < out_c; o++) {
|
||||
for (unsigned i = 0; i < kDim; i++) {
|
||||
float_weights[o * kDim + i] =
|
||||
((float)raw_weights[o * kDim + i] - (float)weight_zp) * orig_weight_scale;
|
||||
}
|
||||
}
|
||||
|
||||
pipe_buffer_write(pcontext, subgraph->coefs_rsrc, 0, weight_size, weights_hwio);
|
||||
free(weights_ohwi);
|
||||
free(weights_hwio);
|
||||
free(raw_weights);
|
||||
|
||||
/* Copy bias data from TFLite model */
|
||||
if (subgraph->bias_rsrc) {
|
||||
pipe_buffer_copy(pcontext, subgraph->bias_rsrc, bias->resource, 0, 0, bias_size);
|
||||
|
||||
/* Debug: print first few bias values */
|
||||
int32_t *bias_data = malloc(bias_size);
|
||||
pipe_buffer_read(pcontext, subgraph->bias_rsrc, 0, bias_size, bias_data);
|
||||
DBG("Bias values (int32): %d %d %d %d...\n", bias_data[0], bias_data[1], bias_data[2], bias_data[3]);
|
||||
free(bias_data);
|
||||
/*
|
||||
* Step 2: Re-quantize per-channel with symmetric int8 (zp=0).
|
||||
* For each output channel, find the max absolute float value,
|
||||
* compute new_scale = max_abs / 127, and quantize.
|
||||
* This guarantees all values fit in [-127, 127] with no clamping.
|
||||
*/
|
||||
int8_t *sym_weights = malloc(weight_size);
|
||||
float *per_ch_weight_scale = malloc(out_c * sizeof(float));
|
||||
int32_t *weight_sums = calloc(out_c, sizeof(int32_t));
|
||||
|
||||
for (unsigned o = 0; o < out_c; o++) {
|
||||
/* Find max absolute value for this output channel */
|
||||
float max_abs = 0.0f;
|
||||
for (unsigned i = 0; i < kDim; i++) {
|
||||
float absv = fabsf(float_weights[o * kDim + i]);
|
||||
if (absv > max_abs)
|
||||
max_abs = absv;
|
||||
}
|
||||
|
||||
/* Compute per-channel symmetric scale */
|
||||
if (max_abs > 0.0f) {
|
||||
per_ch_weight_scale[o] = max_abs / 127.0f;
|
||||
} else {
|
||||
per_ch_weight_scale[o] = 1.0f; /* avoid division by zero */
|
||||
}
|
||||
|
||||
/* Quantize to int8 — guaranteed no clamping needed */
|
||||
int32_t sum = 0;
|
||||
for (unsigned i = 0; i < kDim; i++) {
|
||||
int val = (int)roundf(float_weights[o * kDim + i] / per_ch_weight_scale[o]);
|
||||
/* Should always be in [-127, 127] but clamp defensively */
|
||||
if (val > 127) val = 127;
|
||||
if (val < -128) val = -128;
|
||||
sym_weights[o * kDim + i] = (int8_t)val;
|
||||
sum += sym_weights[o * kDim + i];
|
||||
}
|
||||
weight_sums[o] = sum;
|
||||
}
|
||||
free(float_weights);
|
||||
|
||||
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
|
||||
DBG("Symmetric re-quantization: weight_zp=%d, orig_scale=%f\n",
|
||||
weight_zp, orig_weight_scale);
|
||||
DBG("Per-channel weight scales (first 4): %f %f %f %f\n",
|
||||
per_ch_weight_scale[0], per_ch_weight_scale[1],
|
||||
per_ch_weight_scale[2], per_ch_weight_scale[3]);
|
||||
DBG("First 8 symmetric weights: %d %d %d %d %d %d %d %d\n",
|
||||
sym_weights[0], sym_weights[1], sym_weights[2], sym_weights[3],
|
||||
sym_weights[4], sym_weights[5], sym_weights[6], sym_weights[7]);
|
||||
DBG("Weight sums (first 4 channels): %d %d %d %d\n",
|
||||
weight_sums[0], weight_sums[1], weight_sums[2], weight_sums[3]);
|
||||
}
|
||||
|
||||
/* Build params buffer: input/weight/bias/output IOVAs + conv_params + quant IOVAs */
|
||||
uint64_t main_params[6] = {
|
||||
thames_resource(subgraph->input_rsrc)->iova,
|
||||
thames_resource(subgraph->coefs_rsrc)->iova,
|
||||
subgraph->bias_rsrc ? thames_resource(subgraph->bias_rsrc)->iova : 0,
|
||||
thames_resource(subgraph->output_rsrc)->iova,
|
||||
0, /* Will be filled with conv_params IOVA below */
|
||||
0 /* Will be filled with quant IOVA below */
|
||||
};
|
||||
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
|
||||
MAX2(weight_size, 1024));
|
||||
pipe_buffer_write(pcontext, subgraph->coefs_rsrc, 0, weight_size, sym_weights);
|
||||
free(sym_weights);
|
||||
|
||||
/* Create separate buffer for convolution parameters */
|
||||
/*
|
||||
* Bias: adjust to fold in input and output zero points.
|
||||
*
|
||||
* TI's TIDL formula (from tidl_import_quantize.cpp):
|
||||
* nScale[o] = S_y / (S_x * S_w[o])
|
||||
* finalBias[o] = originalBias[o] + (z_y * nScale[o] - z_x * Σ(weight_s8[o][i]))
|
||||
*
|
||||
* With per-channel symmetric re-quantization, S_w[o] = per_ch_weight_scale[o].
|
||||
*/
|
||||
subgraph->bias_rsrc = NULL;
|
||||
if (bias && bias->resource) {
|
||||
int32_t *bias_data = malloc(bias_size);
|
||||
pipe_buffer_read(pcontext, bias->resource, 0, bias_size, bias_data);
|
||||
|
||||
for (unsigned o = 0; o < out_c; o++) {
|
||||
/* nScale = S_y / (S_x * S_w[o]) — per-channel output-to-accumulator ratio */
|
||||
double nScale = (double)output->scale /
|
||||
((double)input->scale * (double)per_ch_weight_scale[o]);
|
||||
|
||||
double final_bias = (double)bias_data[o]
|
||||
+ (output_zp * nScale)
|
||||
- ((double)input_zp * weight_sums[o]);
|
||||
double abs_bias = final_bias < 0 ? -final_bias : final_bias;
|
||||
if (abs_bias > (double)2147483647) {
|
||||
mesa_logw("Thames: bias overflow on channel %u, zeroing weights", o);
|
||||
final_bias = final_bias / nScale;
|
||||
final_bias = final_bias * (double)output->scale / (double)input->scale;
|
||||
}
|
||||
bias_data[o] = (int32_t)round(final_bias);
|
||||
}
|
||||
|
||||
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
|
||||
DBG("Adjusted bias values (int32): %d %d %d %d...\n",
|
||||
bias_data[0], bias_data[1], bias_data[2], bias_data[3]);
|
||||
}
|
||||
|
||||
subgraph->bias_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
|
||||
MAX2(bias_size, 1024));
|
||||
pipe_buffer_write(pcontext, subgraph->bias_rsrc, 0, bias_size, bias_data);
|
||||
free(bias_data);
|
||||
}
|
||||
free(weight_sums);
|
||||
|
||||
/* Store input_zp for padFillValue in the DSP kernel */
|
||||
subgraph->conv_input_zp = input_zp;
|
||||
subgraph->conv_output_zp = output_zp;
|
||||
|
||||
/*
|
||||
* Quantization parameters for MMALib — per-channel.
|
||||
*
|
||||
* MMALib uses per-channel uint8 scale and uint8 shift for output requantization:
|
||||
* output[c] = clamp((acc[c] * scale[c] + rounding) >> shift[c], min, max)
|
||||
*
|
||||
* Per-channel scale ratio: scaleRatio[c] = S_y / (S_x * S_w[c])
|
||||
* We find uint8 scale and uint8 shift per channel such that:
|
||||
* scale / 2^shift ≈ scaleRatio[c]
|
||||
*
|
||||
* This follows TI's TIDL_getMMAv2_ScaleShiftAndError() algorithm.
|
||||
*/
|
||||
size_t quant_buf_size = 2 * out_c; /* [scale_0..scale_N-1][shift_0..shift_N-1] */
|
||||
uint8_t *quant_data = calloc(1, quant_buf_size);
|
||||
|
||||
for (unsigned ch = 0; ch < out_c; ch++) {
|
||||
double scale_ratio = (double)output->scale /
|
||||
((double)input->scale * (double)per_ch_weight_scale[ch]);
|
||||
|
||||
/*
|
||||
* Find best (scale, shift) pair using TI's brute-force approach:
|
||||
* For each possible scale value (1..255), compute the optimal shift as
|
||||
* shift = round(ln(scale / scaleRatio) / ln(2))
|
||||
* then check the approximation error |scaleRatio - scale/2^shift|.
|
||||
*/
|
||||
uint8_t best_shift = 0;
|
||||
uint8_t best_scale = 1;
|
||||
double min_error = 1e30;
|
||||
for (int s_iter = 1; s_iter <= 255; s_iter++) {
|
||||
int shift_bits = (int)round(log((double)s_iter / scale_ratio) / log(2.0));
|
||||
if (shift_bits > 40) shift_bits = 40;
|
||||
if (shift_bits < 0) shift_bits = 0;
|
||||
double approx = (double)s_iter / pow(2.0, shift_bits);
|
||||
double err = fabs(scale_ratio - approx);
|
||||
if (err < min_error) {
|
||||
min_error = err;
|
||||
best_shift = (uint8_t)shift_bits;
|
||||
best_scale = (uint8_t)s_iter;
|
||||
}
|
||||
}
|
||||
|
||||
quant_data[ch] = best_scale;
|
||||
quant_data[out_c + ch] = best_shift;
|
||||
|
||||
if (ch < 4 && DBG_ENABLED(THAMES_DBG_MSGS)) {
|
||||
DBG("Channel %u: weight_scale=%f, scale_ratio=%f -> scale=%u, shift=%u (eff=%f, err=%e)\n",
|
||||
ch, per_ch_weight_scale[ch], scale_ratio,
|
||||
best_scale, best_shift, (double)best_scale / pow(2.0, best_shift), min_error);
|
||||
}
|
||||
}
|
||||
free(per_ch_weight_scale);
|
||||
subgraph->quant_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
|
||||
MAX2(quant_buf_size, 64));
|
||||
pipe_buffer_write(pcontext, subgraph->quant_rsrc, 0, quant_buf_size, quant_data);
|
||||
free(quant_data);
|
||||
|
||||
/*
|
||||
* Convolution parameters buffer.
|
||||
* Layout must match what the DSP kernel expects (see test_kernel.c header).
|
||||
*/
|
||||
uint32_t conv_params[15] = {
|
||||
in_h, in_w, in_c,
|
||||
k_h, k_w,
|
||||
out_h, out_w, out_c,
|
||||
stride_h, stride_w,
|
||||
pad_h, pad_w,
|
||||
(uint32_t)input->zero_point,
|
||||
(uint32_t)weights->zero_point,
|
||||
(uint32_t)output->zero_point
|
||||
pad_top, pad_bottom, pad_left, pad_right,
|
||||
(uint32_t)input_zp, /* input zero point for padFillValue */
|
||||
};
|
||||
|
||||
/* Compute fixed-point multiplier and shift for requantization.
|
||||
* Use TFLite's QuantizeMultiplier approach: shift clamped to [0, 31],
|
||||
* multiplier adjusted to fit.
|
||||
subgraph->conv_params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
|
||||
sizeof(conv_params));
|
||||
pipe_buffer_write(pcontext, subgraph->conv_params_rsrc, 0,
|
||||
sizeof(conv_params), conv_params);
|
||||
|
||||
DBG("Conv params: in=%ux%ux%u k=%ux%u out=%ux%ux%u stride=%u,%u pad=%u/%u/%u/%u\n",
|
||||
in_h, in_w, in_c, k_h, k_w, out_h, out_w, out_c,
|
||||
stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right);
|
||||
DBG("CHW layout: inChOffset=%u pitchC=%u input_chw_size=%zu output_chw_size=%zu\n",
|
||||
inChOffset, pitchC, input_chw_size, output_chw_size);
|
||||
|
||||
/*
|
||||
* Build main params buffer: 6 IOVAs that args[0..5] will point to.
|
||||
*/
|
||||
double effective_scale = (double)(input->scale * weights->scale) / (double)output->scale;
|
||||
|
||||
int exponent;
|
||||
double significand = frexp(effective_scale, &exponent); /* significand in [0.5, 1.0) */
|
||||
|
||||
/* Compute target shift: 31 - exponent */
|
||||
int target_shift = 31 - exponent;
|
||||
uint32_t shift;
|
||||
int32_t multiplier;
|
||||
|
||||
if (target_shift < 0) {
|
||||
/* Scale >= 1.0, use shift=0 and scale down multiplier */
|
||||
shift = 0;
|
||||
multiplier = (int32_t)round(effective_scale * (1LL << 31));
|
||||
} else if (target_shift > 31) {
|
||||
/* Scale very small, clamp shift to 31 and scale down multiplier */
|
||||
shift = 31;
|
||||
/* multiplier = significand * 2^31 * 2^(31 - target_shift) */
|
||||
double scaled_sig = significand * exp2(31 - target_shift);
|
||||
multiplier = (int32_t)round(scaled_sig * (1LL << 31));
|
||||
} else {
|
||||
shift = target_shift;
|
||||
multiplier = (int32_t)round(significand * (1LL << 31));
|
||||
}
|
||||
|
||||
DBG("Quantization: effective_scale=%f, multiplier=%d (0x%x), shift=%u\n",
|
||||
effective_scale, multiplier, multiplier, shift);
|
||||
DBG("Input scale=%f, weight scale=%f, output scale=%f\n",
|
||||
input->scale, weights->scale, output->scale);
|
||||
DBG("Conv params being sent: input_zp=%u, weight_zp=%u, output_zp=%u\n",
|
||||
(uint32_t)input->zero_point, (uint32_t)weights->zero_point, (uint32_t)output->zero_point);
|
||||
DBG("Multiplier=%d (0x%x), shift=%u, output_zp=%u\n", multiplier, multiplier, shift, (uint32_t)output->zero_point);
|
||||
|
||||
/* Create buffer for quantization params (multiplier, shift) */
|
||||
int32_t quant_params[2] = {
|
||||
multiplier,
|
||||
(int32_t)shift
|
||||
uint64_t main_params[6] = {
|
||||
thames_resource(subgraph->input_rsrc)->iova,
|
||||
thames_resource(subgraph->coefs_rsrc)->iova,
|
||||
subgraph->bias_rsrc ? thames_resource(subgraph->bias_rsrc)->iova : 0,
|
||||
thames_resource(subgraph->output_rsrc)->iova,
|
||||
thames_resource(subgraph->conv_params_rsrc)->iova,
|
||||
thames_resource(subgraph->quant_rsrc)->iova,
|
||||
};
|
||||
|
||||
struct pipe_resource *conv_params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(conv_params));
|
||||
pipe_buffer_write(pcontext, conv_params_rsrc, 0, sizeof(conv_params), conv_params);
|
||||
main_params[4] = thames_resource(conv_params_rsrc)->iova;
|
||||
|
||||
struct pipe_resource *quant_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(quant_params));
|
||||
pipe_buffer_write(pcontext, quant_rsrc, 0, sizeof(quant_params), quant_params);
|
||||
main_params[5] = thames_resource(quant_rsrc)->iova;
|
||||
|
||||
subgraph->params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(main_params));
|
||||
pipe_buffer_write(pcontext, subgraph->params_rsrc, 0, sizeof(main_params), main_params);
|
||||
|
||||
pipe_resource_reference(&conv_params_rsrc, NULL);
|
||||
pipe_resource_reference(&quant_rsrc, NULL);
|
||||
subgraph->params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
|
||||
sizeof(main_params));
|
||||
pipe_buffer_write(pcontext, subgraph->params_rsrc, 0,
|
||||
sizeof(main_params), main_params);
|
||||
|
||||
return &subgraph->base;
|
||||
}
|
||||
|
|
@ -441,21 +595,50 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
|
|||
struct timespec start, end;
|
||||
int ret;
|
||||
|
||||
unsigned in_h = subgraph->conv_in_h;
|
||||
unsigned in_w = subgraph->conv_in_w;
|
||||
unsigned in_c = subgraph->conv_in_c;
|
||||
unsigned inChOffset = subgraph->conv_inChOffset;
|
||||
|
||||
/*
|
||||
* Transpose input from NHWC to CHW for MMALib.
|
||||
*
|
||||
* Source (NHWC): [H][W][C] contiguous, size = H * W * C
|
||||
* Destination (CHW): [C][inChOffset] where inChOffset = ALIGN(H*W, 64)
|
||||
*
|
||||
* Each channel plane has H*W valid pixels followed by padding zeros.
|
||||
*/
|
||||
size_t input_chw_size = (size_t)in_c * inChOffset;
|
||||
uint8_t *chw_buf = calloc(1, input_chw_size); /* calloc zeros the padding */
|
||||
|
||||
for (unsigned i = 0; i < inputs_count; i++) {
|
||||
struct thames_tensor *input = thames_find_tensor(subgraph, input_idxs[i]);
|
||||
assert(input);
|
||||
const uint8_t *nhwc = (const uint8_t *)inputs[i];
|
||||
|
||||
for (unsigned c = 0; c < in_c; c++) {
|
||||
for (unsigned h = 0; h < in_h; h++) {
|
||||
for (unsigned w = 0; w < in_w; w++) {
|
||||
unsigned nhwc_idx = (h * in_w + w) * in_c + c;
|
||||
unsigned chw_idx = c * inChOffset + h * in_w + w;
|
||||
chw_buf[chw_idx] = nhwc[nhwc_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct thames_tensor *input_tensor = thames_find_tensor(subgraph, input_idxs[i]);
|
||||
assert(input_tensor);
|
||||
|
||||
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS))
|
||||
thames_dump_buffer(inputs[i], "input", 0, 0, 0, input->size);
|
||||
thames_dump_buffer(inputs[i], "input-nhwc", 0, 0, 0, in_h * in_w * in_c);
|
||||
|
||||
pipe_buffer_write(pcontext, subgraph->input_rsrc, input->offset, input->size, inputs[i]);
|
||||
pipe_buffer_write(pcontext, subgraph->input_rsrc, 0, input_chw_size, chw_buf);
|
||||
}
|
||||
free(chw_buf);
|
||||
|
||||
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
|
||||
struct pipe_transfer *transfer_in;
|
||||
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->input_rsrc,
|
||||
PIPE_MAP_READ, &transfer_in);
|
||||
thames_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->input_rsrc));
|
||||
thames_dump_buffer(buf, "input-chw", 0, 0, 0, input_chw_size);
|
||||
pipe_buffer_unmap(subgraph->base.context, transfer_in);
|
||||
}
|
||||
|
||||
|
|
@ -465,13 +648,20 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
|
|||
job.params = thames_resource(subgraph->params_rsrc)->handle;
|
||||
job.params_size = pipe_buffer_size(subgraph->params_rsrc);
|
||||
|
||||
/* Pass both input buffers (input_a and input_b) as input BOs */
|
||||
uint32_t in_bo_handles[2] = {
|
||||
thames_resource(subgraph->input_rsrc)->handle,
|
||||
thames_resource(subgraph->coefs_rsrc)->handle,
|
||||
};
|
||||
/*
|
||||
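* Pass the input BOs the kernel will access: input, coefs, bias (when present) and quant.
* NOTE(review): conv_params_rsrc is referenced from the params buffer but is not listed here; confirm whether it needs to be.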
|
||||
* The DRM driver needs to know about them for cache management.
|
||||
*/
|
||||
uint32_t in_bo_handles[4];
|
||||
unsigned in_bo_count = 0;
|
||||
in_bo_handles[in_bo_count++] = thames_resource(subgraph->input_rsrc)->handle;
|
||||
in_bo_handles[in_bo_count++] = thames_resource(subgraph->coefs_rsrc)->handle;
|
||||
if (subgraph->bias_rsrc)
|
||||
in_bo_handles[in_bo_count++] = thames_resource(subgraph->bias_rsrc)->handle;
|
||||
in_bo_handles[in_bo_count++] = thames_resource(subgraph->quant_rsrc)->handle;
|
||||
|
||||
job.in_bo_handles = (uintptr_t)in_bo_handles;
|
||||
job.in_bo_handle_count = 2;
|
||||
job.in_bo_handle_count = in_bo_count;
|
||||
|
||||
job.out_bo_handles = (uintptr_t)&thames_resource(subgraph->output_rsrc)->handle;
|
||||
job.out_bo_handle_count = 1;
|
||||
|
|
@ -499,11 +689,12 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
|
|||
DBG("Input hex: %s\n", hexbuf);
|
||||
pipe_buffer_unmap(subgraph->base.context, transfer_in);
|
||||
|
||||
uint8_t *coefsbuf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, PIPE_MAP_READ, &transfer_in);
|
||||
struct pipe_transfer *transfer_coefs;
|
||||
uint8_t *coefsbuf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, PIPE_MAP_READ, &transfer_coefs);
|
||||
for (int i = 0; i < 32; i++)
|
||||
snprintf(hexbuf + i * 3, 4, "%02x ", coefsbuf[i]);
|
||||
DBG("Coefs hex: %s\n", hexbuf);
|
||||
pipe_buffer_unmap(subgraph->base.context, transfer_in);
|
||||
pipe_buffer_unmap(subgraph->base.context, transfer_coefs);
|
||||
|
||||
/* Read output buffer */
|
||||
struct pipe_transfer *transfer_out;
|
||||
|
|
@ -531,21 +722,44 @@ thames_ml_subgraph_read_outputs(struct pipe_context *pcontext,
|
|||
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
|
||||
uint8_t **outputs = (uint8_t **)outputsv;
|
||||
|
||||
for (int i = 0; i < outputs_count; i++) {
|
||||
struct thames_tensor *output = thames_find_tensor(subgraph, output_idxs[i]);
|
||||
unsigned out_h = subgraph->conv_out_h;
|
||||
unsigned out_w = subgraph->conv_out_w;
|
||||
unsigned out_c = subgraph->conv_out_c;
|
||||
unsigned pitchC = subgraph->conv_pitchC;
|
||||
size_t output_chw_size = (size_t)out_c * pitchC;
|
||||
|
||||
for (int i = 0; i < outputs_count; i++) {
|
||||
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
|
||||
struct pipe_transfer *transfer_in;
|
||||
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->output_rsrc,
|
||||
PIPE_MAP_READ, &transfer_in);
|
||||
thames_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->output_rsrc));
|
||||
thames_dump_buffer(buf, "output-chw", 0, 0, 0, output_chw_size);
|
||||
pipe_buffer_unmap(subgraph->base.context, transfer_in);
|
||||
}
|
||||
|
||||
/* For test kernel phase: output is at offset 0, size is the buffer size */
|
||||
unsigned size = pipe_buffer_size(subgraph->output_rsrc);
|
||||
DBG("Reading output %u from offset 0, size %u\n", output_idxs[i], size);
|
||||
pipe_buffer_read(pcontext, subgraph->output_rsrc, 0, size, outputs[i]);
|
||||
/*
|
||||
* Read CHW output from DSP and transpose to NHWC for TFLite.
|
||||
*
|
||||
* Source (CHW): [out_c][pitchC] where each channel has out_h*out_w valid pixels
|
||||
* Destination (NHWC): [out_h][out_w][out_c] contiguous
|
||||
*/
|
||||
uint8_t *chw_out = malloc(output_chw_size);
|
||||
pipe_buffer_read(pcontext, subgraph->output_rsrc, 0, output_chw_size, chw_out);
|
||||
|
||||
for (unsigned c = 0; c < out_c; c++) {
|
||||
for (unsigned h = 0; h < out_h; h++) {
|
||||
for (unsigned w = 0; w < out_w; w++) {
|
||||
unsigned chw_idx = c * pitchC + h * out_w + w;
|
||||
unsigned nhwc_idx = (h * out_w + w) * out_c + c;
|
||||
outputs[i][nhwc_idx] = chw_out[chw_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(chw_out);
|
||||
|
||||
DBG("Reading output %u: CHW %ux%u -> NHWC %ux%ux%u (%u bytes)\n",
|
||||
output_idxs[i], out_c, pitchC, out_h, out_w, out_c, out_h * out_w * out_c);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -561,6 +775,8 @@ thames_ml_subgraph_destroy(struct pipe_context *pcontext,
|
|||
pipe_resource_reference(&subgraph->bias_rsrc, NULL);
|
||||
pipe_resource_reference(&subgraph->kernel_rsrc, NULL);
|
||||
pipe_resource_reference(&subgraph->params_rsrc, NULL);
|
||||
pipe_resource_reference(&subgraph->quant_rsrc, NULL);
|
||||
pipe_resource_reference(&subgraph->conv_params_rsrc, NULL);
|
||||
|
||||
util_dynarray_fini(&subgraph->operations);
|
||||
util_dynarray_fini(&subgraph->tensors);
|
||||
|
|
|
|||
|
|
@ -186,8 +186,18 @@ struct thames_subgraph {
|
|||
uint8_t *coefs;
|
||||
struct pipe_resource *coefs_rsrc;
|
||||
unsigned coefs_used;
|
||||
|
||||
|
||||
struct pipe_resource *bias_rsrc; /* Bias tensor (int32) */
|
||||
struct pipe_resource *quant_rsrc; /* Scale + shift (uint8 packed) */
|
||||
struct pipe_resource *conv_params_rsrc; /* Convolution parameters */
|
||||
|
||||
/* Convolution dimensions (needed for NHWC<->CHW transpose at invoke/read) */
|
||||
unsigned conv_in_h, conv_in_w, conv_in_c;
|
||||
unsigned conv_out_h, conv_out_w, conv_out_c;
|
||||
unsigned conv_inChOffset; /* ALIGN(in_h*in_w + 64, 64) */
|
||||
unsigned conv_pitchC; /* ALIGN(out_h*out_w + 64, 64) */
|
||||
int conv_input_zp; /* Input zero point (for padFillValue) */
|
||||
int conv_output_zp; /* Output zero point (post-add) */
|
||||
};
|
||||
|
||||
bool
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue