Use MMALib

This commit is contained in:
Tomeu Vizoso 2026-02-13 10:15:50 +01:00
parent b8319435ef
commit cbc347c3ee
5 changed files with 29426 additions and 329 deletions

View file

@ -27,6 +27,10 @@ scp "${KERNEL_SRC}" "${BOARD_USER}@${BOARD_IP}:/tmp/test_kernel.c"
# Create linker command file on board to ensure proper section placement
echo "[1.5/5] Creating linker.cmd on board..."
ssh "${BOARD_USER}@${BOARD_IP}" "cat > /tmp/linker.cmd <<EOF
MEMORY
{
L2SRAM (RWX) : origin = 0x0, length = 0x200000
}
SECTIONS
{
.text : {
@ -34,7 +38,10 @@ SECTIONS
*(.text)
*(.const)
*(.switch)
}
*(.kernel_data)
*(.data)
*(.bss)
} > L2SRAM
}
EOF"
@ -50,6 +57,8 @@ ssh "${BOARD_USER}@${BOARD_IP}" "cd /tmp && ${CL7X_PATH} -mv7524 --abi=eabi -O0
--reread_libs --warn_sections \
--rom_model \
-l${MMALIB_LIB} \
-l/home/tomeu/src/ti-processor-sdk-rtos-j722s-evm-09_02_00_05/mmalib_09_02_00_08/lib/C7524/release/mmalib_cn_C7524.lib \
-l/home/tomeu/src/ti-processor-sdk-rtos-j722s-evm-09_02_00_05/mmalib_09_02_00_08/lib/C7524/release/common_C7524.lib \
-lrts7524_le.lib \
linker.cmd \
-o test_kernel.out"

View file

@ -4,11 +4,20 @@
* This kernel uses TI's MMALib to perform quantized int8 convolution
* accelerated by the C7x Matrix Multiply Accelerator (MMA) hardware.
*
* Data layout contract with Mesa (CPU) side:
* - Input arrives in CHW layout (transposed from NHWC by Mesa)
* Layout: [numInChannels][inChOffset] where inChOffset = ALIGN(H*W, 64)
* - Weights arrive in OHWI layout (TFLite native, no transpose needed)
* Layout: [numOutChannels][kH * kW * numInChannels]
* - Bias: [numOutChannels] int32 values
* - Output produced in CHW layout (Mesa transposes back to NHWC)
* Layout: [numOutChannels][pitchC] where pitchC = ALIGN(outH*outW, 64)
*
* Args:
* args[0] = input tensor (uint8, NHWC layout)
* args[1] = weight tensor (uint8, HWIO layout)
* args[0] = input tensor (uint8, CHW layout, padded to inChOffset per channel)
* args[1] = weight tensor (int8, OHWI layout = [Cout][Cin*Fr*Fc])
* args[2] = bias tensor (int32, per output channel)
* args[3] = output tensor (uint8, NHWC layout)
* args[3] = output tensor (uint8, CHW layout, pitchC per channel)
* args[4] = params struct pointer (uint32):
* params[0] = input_height
* params[1] = input_width
@ -20,14 +29,14 @@
* params[7] = output_channels
* params[8] = stride_h
* params[9] = stride_w
* params[10] = pad_h
* params[11] = pad_w
* params[12] = input_zero_point (unused - MMALib handles this internally)
* params[13] = weight_zero_point (unused - MMALib handles this internally)
* params[14] = output_zero_point (unused - MMALib handles this internally)
* args[5] = quantization params (int32):
* quant[0] = multiplier (int32)
* quant[1] = shift (int32)
* params[10] = pad_top
* params[11] = pad_bottom
* params[12] = pad_left
* params[13] = pad_right
* params[14] = input_zero_point (for padFillValue)
* args[5] = quantization params (uint8 packed):
* quant[0..out_c-1] = scale values (uint8)
* quant[out_c..2*out_c-1] = shift values (uint8)
*
* Compile with:
* See compile-kernel.sh for build procedure
@ -38,68 +47,281 @@
/* MMALib headers - MMA hardware acceleration library */
#include "mmalib.h"
/* Align a value up to the next multiple of 'align' (align must be power of 2) */
#define ALIGN_UP(val, align) (((val) + (align) - 1) & ~((align) - 1))
/*
* Quantized int8 convolution kernel using C7x MMA hardware via MMALib
* Quantized convolution kernel using C7x MMA hardware via MMALib
*
* This is the entry point called from the DRM driver.
* Note: This code runs bare-metal on the DSP, so no stdlib functions.
* Placed in .text.entry section to ensure it's at the beginning of the binary.
*/
/* Static buffers for handle and packed weights */
/* Placed in .kernel_data section so they are uploaded with the kernel code */
#pragma DATA_ALIGN(handle_buffer, 128)
__attribute__((section(".kernel_data")))
static uint8_t handle_buffer[16384];
/* Weights padded to pitchA stride between output channels */
#pragma DATA_ALIGN(weights_padded, 128)
__attribute__((section(".kernel_data")))
static int8_t weights_padded[65536];
/* Weights after MMALib reorder for MMA-friendly layout */
#pragma DATA_ALIGN(weights_reordered, 128)
__attribute__((section(".kernel_data")))
static int8_t weights_reordered[65536];
/* Per-channel scale (uint8) and shift (uint8) for requantization */
#pragma DATA_ALIGN(scale_buf, 128)
__attribute__((section(".kernel_data")))
static uint8_t scale_buf[2048];
#pragma DATA_ALIGN(shift_buf, 128)
__attribute__((section(".kernel_data")))
static uint8_t shift_buf[2048];
#pragma RETAIN(test_kernel)
__attribute__((section(".text.entry")))
int
test_kernel(unsigned long long *args)
{
unsigned char *input = (unsigned char *)args[0];
unsigned char *weights = (unsigned char *)args[1];
int *bias = (int *)args[2];
unsigned char *output = (unsigned char *)args[3];
unsigned int *params = (unsigned int *)args[4];
int *quant = (int *)args[5];
unsigned char *input = (unsigned char *)args[0];
signed char *weights = (signed char *)args[1];
int *bias = (int *)args[2];
unsigned char *output = (unsigned char *)args[3];
unsigned int *params = (unsigned int *)args[4];
unsigned char *quant = (unsigned char *)args[5];
/* Extract convolution parameters */
unsigned int in_h = params[0];
unsigned int in_w = params[1];
unsigned int in_c = params[2];
unsigned int k_h = params[3];
unsigned int k_w = params[4];
unsigned int out_h = params[5];
unsigned int out_w = params[6];
unsigned int out_c = params[7];
unsigned int stride_h = params[8];
unsigned int stride_w = params[9];
int pad_h = (int)params[10];
int pad_w = (int)params[11];
/* Extract quantization parameters */
int multiplier = quant[0];
int shift = quant[1];
/* Basic validation */
if (!input || !weights || !bias || !output || !params || !quant) {
return -1;
}
if (in_w == 0 || in_h == 0 || in_c == 0 ||
k_w == 0 || k_h == 0 || out_c == 0 ||
stride_w == 0 || stride_h == 0) {
return -1;
}
/* TODO: Call MMALib convolution function
*
* This requires:
* 1. Allocating a kernel handle
* 2. Setting up MMALIB_bufParams structures
* 3. Calling MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init()
* 4. Calling MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_exec()
*
* For now, return success to test static linking of mmalib_C7524.lib
*/
return 0; /* Success */
/* Extract convolution parameters */
unsigned int in_h = params[0];
unsigned int in_w = params[1];
unsigned int in_c = params[2];
unsigned int k_h = params[3];
unsigned int k_w = params[4];
unsigned int out_h = params[5];
unsigned int out_w = params[6];
unsigned int out_c = params[7];
unsigned int stride_h = params[8];
unsigned int stride_w = params[9];
int pad_top = (int)params[10];
int pad_bottom = (int)params[11];
int pad_left = (int)params[12];
int pad_right = (int)params[13];
int input_zp = (int)params[14];
/* Basic validation */
if (!input || !weights || !bias || !output || !params || !quant)
return -1;
if (in_w == 0 || in_h == 0 || in_c == 0 ||
k_w == 0 || k_h == 0 || out_c == 0 ||
stride_w == 0 || stride_h == 0)
return -1;
/*
* Derived parameters for MMALib (following test case 1005 pattern)
*
* MMALib convolveBias_row operates on channel-first data:
* A matrix (src0) = weights: [numOutChannels][kDim] with stride pitchA
* B matrix (src1) = input: [numInChannels][inChOffset]
* C matrix (dst) = output: [numOutChannels][pitchC]
*/
int kDim = k_h * k_w * in_c; /* elements per filter */
int pitchA = ALIGN_UP(kDim, 64); /* 64-byte aligned weight row stride */
int spatial = in_w * in_h; /* total spatial locations per channel */
int inChOffset = ALIGN_UP(spatial + 64, 64); /* channel stride with extra padding (test 1005 pattern) */
int outSpatial = out_w * out_h;
int pitchC = ALIGN_UP(outSpatial + 64, 64); /* output channel stride with extra padding */
/*
* Copy per-channel scale and shift from the quant buffer.
* Mesa packs them as: [scale0..scaleN-1][shift0..shiftN-1], both uint8.
*/
if (out_c > 2048)
return -2;
for (unsigned int i = 0; i < out_c; i++) {
scale_buf[i] = quant[i];
shift_buf[i] = quant[out_c + i];
}
/*
* Pad weights from OHWI [out_c][kDim] contiguous to [out_c][pitchA] with
* zero-padding between rows for 64-byte alignment.
*/
int weight_padded_size = out_c * pitchA;
if (weight_padded_size > (int)sizeof(weights_padded))
return -2;
for (unsigned int o = 0; o < out_c; o++) {
for (int j = 0; j < kDim; j++)
weights_padded[o * pitchA + j] = weights[o * kDim + j];
for (int j = kDim; j < pitchA; j++)
weights_padded[o * pitchA + j] = 0;
}
/*
* Weight reorder: rearrange from natural OHWI to MMA-friendly tiled layout.
* Must happen before init.
*/
MMALIB_CNN_convolveBias_row_processWeights_Args reorderArgs;
reorderArgs.funcStyle = MMALIB_FUNCTION_OPTIMIZED;
reorderArgs.data_type = MMALIB_UINT8;
reorderArgs.Fr = k_h;
reorderArgs.Fc = k_w;
reorderArgs.pitchA = pitchA;
reorderArgs.numInChPerGroup = in_c;
reorderArgs.subMChannels = out_c;
reorderArgs.No = out_c;
reorderArgs.numGroupsPerKernel = 1;
reorderArgs.packetizeMode = 1;
int32_t reorderSize = MMALIB_CNN_convolveBias_row_processWeights_getMemorySize(
&reorderArgs, weights_padded);
if (reorderSize > (int)sizeof(weights_reordered))
return -4;
MMALIB_STATUS status = MMALIB_CNN_convolveBias_row_processWeights_reorder(
&reorderArgs, weights_padded, weights_reordered);
if (status != MMALIB_SUCCESS)
return 3000 + (int)status;
/* ---- MMALib structures ---- */
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_InitArgs initArgs;
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_ExecInArgs execInArgs;
MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_ExecOutArgs execOutArgs;
MMALIB_bufParams2D_t src0_addr; /* Weights (A matrix) */
MMALIB_bufParams2D_t src1_addr; /* Input feature map (B matrix) */
MMALIB_bufParams2D_t src2_addr; /* Bias */
MMALIB_bufParams1D_t src3_addr; /* Scale */
MMALIB_bufParams3D_t dst_addr; /* Output feature map (C matrix) */
/*
* InitArgs -- following MMALib test case 1005 conventions:
* Fc = kernel width (spatial), NOT multiplied by in_c
* Fr = kernel height
* strideX/Y = spatial stride, NOT multiplied by in_c
* inWidth = spatial width of input feature map
* maxHeight = spatial height of input feature map
* inChOffset = pitch between input channel planes (>= spatial, 64-aligned)
* validColsIn = total valid spatial locations to process
*/
initArgs.funcStyle = MMALIB_FUNCTION_OPTIMIZED;
initArgs.Fc = k_w;
initArgs.Fr = k_h;
initArgs.strideX = stride_w;
initArgs.strideY = stride_h;
initArgs.dilationX = 1;
initArgs.dilationY = 1;
initArgs.inWidth = in_w;
initArgs.maxHeight = in_h;
initArgs.inChOffset = inChOffset;
initArgs.validColsIn = spatial; /* process all spatial locations */
initArgs.validColsPerRowIn = 0; /* 0 for LINEAR stride-1 */
initArgs.validRowsIn = 0; /* 0 for LINEAR stride-1 */
initArgs.inputPitchPerRow = 0; /* 0 for LINEAR stride-1 */
initArgs.outputPitchPerRow = 0; /* 0 for LINEAR stride-1 */
initArgs.No = out_c;
initArgs.subMChannels = out_c; /* process all output channels at once */
initArgs.numGroupsPerKernel = 1;
initArgs.bias = 1; /* bias enabled */
initArgs.activationType = MMALIB_RELU; /* ReLU clamps negative to 0 */
initArgs.pSatMin = 0;
initArgs.pSatMax = 0; /* 0 = use default saturation for data type */
initArgs.mode = MMALIB_LINEAR;
initArgs.col = 0;
initArgs.pad = 0;
initArgs.padTop = pad_top;
initArgs.padBottom = pad_bottom;
initArgs.padLeft = pad_left;
initArgs.padRight = pad_right;
initArgs.validColsOutBottom = 0;
initArgs.packetizeMode = 1; /* weights are reordered/packetized */
/*
* Buffer parameter structures
*
* src0 (weights): 2D [numOutChannels][kDim] with pitchA stride
* src1 (input): 2D [numInChannels][inChOffset]
* src2 (bias): 2D [1][numOutChannels] (int32 per output channel)
* src3 (scale): 1D [numOutChannels] (uint8 per channel)
* dst (output): 3D [numGroupsPerKernel][numOutChannels][pitchC]
*/
src0_addr.data_type = MMALIB_INT8;
src0_addr.dim_x = kDim;
src0_addr.dim_y = out_c;
src0_addr.stride_y = pitchA;
src1_addr.data_type = MMALIB_UINT8;
src1_addr.dim_x = inChOffset;
src1_addr.dim_y = in_c;
src1_addr.stride_y = inChOffset; /* * sizeof(uint8) = inChOffset bytes */
src2_addr.data_type = MMALIB_INT32;
src2_addr.dim_x = out_c;
src2_addr.dim_y = 1;
src2_addr.stride_y = out_c; /* in elements, not bytes — MMALib uses data_type for byte stride */
src3_addr.data_type = MMALIB_UINT8;
src3_addr.dim_x = out_c;
dst_addr.data_type = MMALIB_UINT8;
dst_addr.dim_x = pitchC;
dst_addr.dim_y = out_c;
dst_addr.stride_y = pitchC; /* bytes (uint8) */
dst_addr.dim_z = 1;
dst_addr.stride_z = out_c * pitchC;
/* Get Handle Size */
int32_t handleSize = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_getHandleSize(&initArgs);
if (handleSize > (int)sizeof(handle_buffer))
return -3;
/* Check params before init */
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init_checkParams(
handle_buffer,
&src0_addr, &src1_addr, &src2_addr, &src3_addr, &dst_addr,
&initArgs);
if (status != MMALIB_SUCCESS)
return 4000 + (int)status;
/* Init kernel handle */
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_init(
handle_buffer,
&src0_addr, &src1_addr, &src2_addr, &src3_addr, &dst_addr,
&initArgs);
if (status != MMALIB_SUCCESS)
return 1000 + (int)status;
/* Fill ExecInArgs */
execInArgs.validColsIn = spatial;
execInArgs.validColsPerRowIn = 0;
execInArgs.validRowsIn = 0;
execInArgs.col = 0;
execInArgs.subMChannels = out_c;
execInArgs.quantMethod = 1; /* MMALIB_QM_PER_CHANNEL */
execInArgs.padFillValue = input_zp; /* padding fill = input zero point */
execInArgs.enableDynamicRange = 0;
execInArgs.initDynamicRange = 0;
/* Execute convolution */
status = MMALIB_CNN_convolveBias_row_ixX_ixX_oxX_exec(
handle_buffer,
weights_reordered,
input,
bias,
scale_buf,
shift_buf,
output,
&execInArgs,
&execOutArgs);
if (status != MMALIB_SUCCESS)
return 2000 + (int)status;
return 0;
}
/* Dummy main to satisfy linker/RTS requirements (we don't run _c_int00) */
int main(void) { return 0; }

File diff suppressed because it is too large Load diff

View file

@ -265,165 +265,319 @@ thames_ml_subgraph_create(struct pipe_context *pcontext,
input->zero_point, weights->zero_point, bias ? bias->zero_point : 0, output->zero_point);
/* Input dimensions: NHWC */
unsigned int in_h = input->dims[1];
unsigned int in_w = input->dims[2];
unsigned int in_c = input->dims[3];
unsigned in_h = input->dims[1];
unsigned in_w = input->dims[2];
unsigned in_c = input->dims[3];
/* Weight dimensions: TFLite uses OHWI format (output, height, width, input) */
unsigned int out_c = weights->dims[0];
unsigned int k_h = weights->dims[1];
unsigned int k_w = weights->dims[2];
unsigned int k_in_c = weights->dims[3];
unsigned out_c = weights->dims[0];
unsigned k_h = weights->dims[1];
unsigned k_w = weights->dims[2];
unsigned k_in_c = weights->dims[3];
/* Output dimensions: NHWC */
unsigned int out_h = output->dims[1];
unsigned int out_w = output->dims[2];
unsigned out_h = output->dims[1];
unsigned out_w = output->dims[2];
unsigned int stride_h = conv_op->conv.stride_y;
unsigned int stride_w = conv_op->conv.stride_x;
unsigned stride_h = conv_op->conv.stride_y;
unsigned stride_w = conv_op->conv.stride_x;
/* Calculate padding from input/output/kernel dimensions */
unsigned int pad_h, pad_w;
unsigned pad_top, pad_bottom, pad_left, pad_right;
if (conv_op->conv.padding_same) {
pad_h = ((out_h - 1) * stride_h + k_h - in_h) / 2;
pad_w = ((out_w - 1) * stride_w + k_w - in_w) / 2;
unsigned pad_h = ((out_h - 1) * stride_h + k_h - in_h);
unsigned pad_w = ((out_w - 1) * stride_w + k_w - in_w);
pad_top = pad_h / 2;
pad_bottom = pad_h - pad_top;
pad_left = pad_w / 2;
pad_right = pad_w - pad_left;
} else {
pad_h = 0;
pad_w = 0;
pad_top = pad_bottom = pad_left = pad_right = 0;
}
DBG("Conv: %ux%ux%u input, %ux%ux%ux%u kernel (k_in_c=%u), stride=%u,%u, pad=%u,%u -> %ux%ux%u output\n",
in_h, in_w, in_c, k_h, k_w, k_in_c, out_c, k_in_c, stride_h, stride_w, pad_h, pad_w, out_h, out_w, out_c);
DBG("Conv: %ux%ux%u input, %ux%ux%ux%u kernel (k_in_c=%u), stride=%u,%u, "
"pad=%u/%u/%u/%u -> %ux%ux%u output\n",
in_h, in_w, in_c, k_h, k_w, k_in_c, out_c, k_in_c,
stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right,
out_h, out_w, out_c);
/*
* Store convolution dimensions in subgraph for use during invoke/read.
* MMALib expects CHW layout, so we compute aligned pitches.
*/
unsigned spatial = in_w * in_h;
unsigned inChOffset = ALIGN_POT(spatial + 64, 64); /* extra padding (test 1005 pattern) */
unsigned outSpatial = out_w * out_h;
unsigned pitchC = ALIGN_POT(outSpatial + 64, 64); /* extra padding (test 1005 pattern) */
subgraph->conv_in_h = in_h;
subgraph->conv_in_w = in_w;
subgraph->conv_in_c = in_c;
subgraph->conv_out_h = out_h;
subgraph->conv_out_w = out_w;
subgraph->conv_out_c = out_c;
subgraph->conv_inChOffset = inChOffset;
subgraph->conv_pitchC = pitchC;
size_t input_size = in_h * in_w * in_c;
size_t weight_size = k_h * k_w * k_in_c * out_c;
size_t bias_size = out_c * sizeof(int32_t);
size_t output_size = out_h * out_w * out_c;
/* Allocate buffers - reuse coefs_rsrc for weights, create new one for bias if needed */
subgraph->input_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(input_size, 1024));
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(weight_size, 1024));
subgraph->output_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(output_size, 1024));
/* Create bias buffer */
subgraph->bias_rsrc = NULL;
if (bias && bias->resource) {
subgraph->bias_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, MAX2(bias_size, 1024));
}
/* Input buffer: CHW layout [in_c][inChOffset] */
size_t input_chw_size = (size_t)in_c * inChOffset;
subgraph->input_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(input_chw_size, 1024));
/* Transpose weights from TFLite OHWI to kernel's expected HWIO layout */
assert(weights->resource);
signed char *weights_ohwi = malloc(weight_size);
signed char *weights_hwio = malloc(weight_size);
pipe_buffer_read(pcontext, weights->resource, 0, weight_size, weights_ohwi);
/* Transpose: OHWI -> HWIO */
for (unsigned oc = 0; oc < out_c; oc++) {
for (unsigned kh = 0; kh < k_h; kh++) {
for (unsigned kw = 0; kw < k_w; kw++) {
for (unsigned ic = 0; ic < k_in_c; ic++) {
unsigned ohwi_idx = oc * k_h * k_w * k_in_c + kh * k_w * k_in_c + kw * k_in_c + ic;
unsigned hwio_idx = kh * k_w * k_in_c * out_c + kw * k_in_c * out_c + ic * out_c + oc;
weights_hwio[hwio_idx] = weights_ohwi[ohwi_idx];
}
}
/* Output buffer: CHW layout [out_c][pitchC] */
size_t output_chw_size = (size_t)out_c * pitchC;
subgraph->output_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(output_chw_size, 1024));
/*
* Weights: TFLite provides uint8 weights with a per-tensor zero_point.
* MMALib expects signed int8 weights with symmetric quantization (zp=0).
*
* When the TFLite weight_zp != 0, naively subtracting weight_zp and
* clamping to [-128,127] is lossy (e.g. weight_zp=133 clamps values
* 0–4, which map to −133…−129, below −128). Instead, following TI's TIDL approach, we:
*
* 1. Dequantize weights to float: float_w = (uint8_w - weight_zp) * weight_scale
* 2. Re-quantize per-channel with symmetric int8 (zp=0):
* new_scale[c] = max(|float_w[c]|) / 127
* int8_w = round(float_w / new_scale[c]) — always fits in [-127,127]
* 3. Compute per-channel bias adjustment and scale/shift using
* new_scale[c] instead of the original weight_scale.
*
* This eliminates clamping entirely and gives exact results.
*/
int input_zp = input->zero_point; /* typically 128 for uint8 */
int weight_zp = weights->zero_point; /* typically non-zero for uint8 weights */
int output_zp = output->zero_point; /* typically 0 */
DBG("Zero points: input_zp=%d, weight_zp=%d, output_zp=%d\n",
input_zp, weight_zp, output_zp);
/* Read raw uint8 weights from TFLite resource */
uint8_t *raw_weights = malloc(weight_size);
pipe_buffer_read(pcontext, weights->resource, 0, weight_size, raw_weights);
unsigned kDim = k_h * k_w * k_in_c; /* elements per output filter */
/*
* Step 1: Dequantize weights to float.
* float_weight = (uint8_weight - weight_zp) * weight_scale
*/
float *float_weights = malloc(weight_size * sizeof(float));
float orig_weight_scale = weights->scale;
for (unsigned o = 0; o < out_c; o++) {
for (unsigned i = 0; i < kDim; i++) {
float_weights[o * kDim + i] =
((float)raw_weights[o * kDim + i] - (float)weight_zp) * orig_weight_scale;
}
}
pipe_buffer_write(pcontext, subgraph->coefs_rsrc, 0, weight_size, weights_hwio);
free(weights_ohwi);
free(weights_hwio);
free(raw_weights);
/* Copy bias data from TFLite model */
if (subgraph->bias_rsrc) {
pipe_buffer_copy(pcontext, subgraph->bias_rsrc, bias->resource, 0, 0, bias_size);
/* Debug: print first few bias values */
int32_t *bias_data = malloc(bias_size);
pipe_buffer_read(pcontext, subgraph->bias_rsrc, 0, bias_size, bias_data);
DBG("Bias values (int32): %d %d %d %d...\n", bias_data[0], bias_data[1], bias_data[2], bias_data[3]);
free(bias_data);
/*
* Step 2: Re-quantize per-channel with symmetric int8 (zp=0).
* For each output channel, find the max absolute float value,
* compute new_scale = max_abs / 127, and quantize.
* This guarantees all values fit in [-127, 127] with no clamping.
*/
int8_t *sym_weights = malloc(weight_size);
float *per_ch_weight_scale = malloc(out_c * sizeof(float));
int32_t *weight_sums = calloc(out_c, sizeof(int32_t));
for (unsigned o = 0; o < out_c; o++) {
/* Find max absolute value for this output channel */
float max_abs = 0.0f;
for (unsigned i = 0; i < kDim; i++) {
float absv = fabsf(float_weights[o * kDim + i]);
if (absv > max_abs)
max_abs = absv;
}
/* Compute per-channel symmetric scale */
if (max_abs > 0.0f) {
per_ch_weight_scale[o] = max_abs / 127.0f;
} else {
per_ch_weight_scale[o] = 1.0f; /* avoid division by zero */
}
/* Quantize to int8 — guaranteed no clamping needed */
int32_t sum = 0;
for (unsigned i = 0; i < kDim; i++) {
int val = (int)roundf(float_weights[o * kDim + i] / per_ch_weight_scale[o]);
/* Should always be in [-127, 127] but clamp defensively */
if (val > 127) val = 127;
if (val < -128) val = -128;
sym_weights[o * kDim + i] = (int8_t)val;
sum += sym_weights[o * kDim + i];
}
weight_sums[o] = sum;
}
free(float_weights);
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
DBG("Symmetric re-quantization: weight_zp=%d, orig_scale=%f\n",
weight_zp, orig_weight_scale);
DBG("Per-channel weight scales (first 4): %f %f %f %f\n",
per_ch_weight_scale[0], per_ch_weight_scale[1],
per_ch_weight_scale[2], per_ch_weight_scale[3]);
DBG("First 8 symmetric weights: %d %d %d %d %d %d %d %d\n",
sym_weights[0], sym_weights[1], sym_weights[2], sym_weights[3],
sym_weights[4], sym_weights[5], sym_weights[6], sym_weights[7]);
DBG("Weight sums (first 4 channels): %d %d %d %d\n",
weight_sums[0], weight_sums[1], weight_sums[2], weight_sums[3]);
}
/* Build params buffer: input/weight/bias/output IOVAs + conv_params + quant IOVAs */
uint64_t main_params[6] = {
thames_resource(subgraph->input_rsrc)->iova,
thames_resource(subgraph->coefs_rsrc)->iova,
subgraph->bias_rsrc ? thames_resource(subgraph->bias_rsrc)->iova : 0,
thames_resource(subgraph->output_rsrc)->iova,
0, /* Will be filled with conv_params IOVA below */
0 /* Will be filled with quant IOVA below */
};
subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(weight_size, 1024));
pipe_buffer_write(pcontext, subgraph->coefs_rsrc, 0, weight_size, sym_weights);
free(sym_weights);
/* Create separate buffer for convolution parameters */
/*
* Bias: adjust to fold in input and output zero points.
*
* TI's TIDL formula (from tidl_import_quantize.cpp):
* nScale[o] = S_y / (S_x * S_w[o])
* finalBias[o] = originalBias[o] + (z_y * nScale[o] - z_x * Σ(weight_s8[o][i]))
*
* With per-channel symmetric re-quantization, S_w[o] = per_ch_weight_scale[o].
*/
subgraph->bias_rsrc = NULL;
if (bias && bias->resource) {
int32_t *bias_data = malloc(bias_size);
pipe_buffer_read(pcontext, bias->resource, 0, bias_size, bias_data);
for (unsigned o = 0; o < out_c; o++) {
/* nScale = S_y / (S_x * S_w[o]) — per-channel output-to-accumulator ratio */
double nScale = (double)output->scale /
((double)input->scale * (double)per_ch_weight_scale[o]);
double final_bias = (double)bias_data[o]
+ (output_zp * nScale)
- ((double)input_zp * weight_sums[o]);
double abs_bias = final_bias < 0 ? -final_bias : final_bias;
if (abs_bias > (double)2147483647) {
mesa_logw("Thames: bias overflow on channel %u, zeroing weights", o);
final_bias = final_bias / nScale;
final_bias = final_bias * (double)output->scale / (double)input->scale;
}
bias_data[o] = (int32_t)round(final_bias);
}
if (DBG_ENABLED(THAMES_DBG_MSGS)) {
DBG("Adjusted bias values (int32): %d %d %d %d...\n",
bias_data[0], bias_data[1], bias_data[2], bias_data[3]);
}
subgraph->bias_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(bias_size, 1024));
pipe_buffer_write(pcontext, subgraph->bias_rsrc, 0, bias_size, bias_data);
free(bias_data);
}
free(weight_sums);
/* Store input_zp for padFillValue in the DSP kernel */
subgraph->conv_input_zp = input_zp;
subgraph->conv_output_zp = output_zp;
/*
* Quantization parameters for MMALib per-channel.
*
* MMALib uses per-channel uint8 scale and uint8 shift for output requantization:
* output[c] = clamp((acc[c] * scale[c] + rounding) >> shift[c], min, max)
*
* Per-channel scale ratio: scaleRatio[c] = S_y / (S_x * S_w[c])
* We find uint8 scale and uint8 shift per channel such that:
* scale / 2^shift ≈ scaleRatio[c]
*
* This follows TI's TIDL_getMMAv2_ScaleShiftAndError() algorithm.
*/
size_t quant_buf_size = 2 * out_c; /* [scale_0..scale_N-1][shift_0..shift_N-1] */
uint8_t *quant_data = calloc(1, quant_buf_size);
for (unsigned ch = 0; ch < out_c; ch++) {
double scale_ratio = (double)output->scale /
((double)input->scale * (double)per_ch_weight_scale[ch]);
/*
* Find best (scale, shift) pair using TI's brute-force approach:
* For each possible scale value (1..255), compute the optimal shift as
* shift = round(ln(scale / scaleRatio) / ln(2))
* then check the approximation error |scaleRatio - scale/2^shift|.
*/
uint8_t best_shift = 0;
uint8_t best_scale = 1;
double min_error = 1e30;
for (int s_iter = 1; s_iter <= 255; s_iter++) {
int shift_bits = (int)round(log((double)s_iter / scale_ratio) / log(2.0));
if (shift_bits > 40) shift_bits = 40;
if (shift_bits < 0) shift_bits = 0;
double approx = (double)s_iter / pow(2.0, shift_bits);
double err = fabs(scale_ratio - approx);
if (err < min_error) {
min_error = err;
best_shift = (uint8_t)shift_bits;
best_scale = (uint8_t)s_iter;
}
}
quant_data[ch] = best_scale;
quant_data[out_c + ch] = best_shift;
if (ch < 4 && DBG_ENABLED(THAMES_DBG_MSGS)) {
DBG("Channel %u: weight_scale=%f, scale_ratio=%f -> scale=%u, shift=%u (eff=%f, err=%e)\n",
ch, per_ch_weight_scale[ch], scale_ratio,
best_scale, best_shift, (double)best_scale / pow(2.0, best_shift), min_error);
}
}
free(per_ch_weight_scale);
subgraph->quant_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
MAX2(quant_buf_size, 64));
pipe_buffer_write(pcontext, subgraph->quant_rsrc, 0, quant_buf_size, quant_data);
free(quant_data);
/*
* Convolution parameters buffer.
* Layout must match what the DSP kernel expects (see test_kernel.c header).
*/
uint32_t conv_params[15] = {
in_h, in_w, in_c,
k_h, k_w,
out_h, out_w, out_c,
stride_h, stride_w,
pad_h, pad_w,
(uint32_t)input->zero_point,
(uint32_t)weights->zero_point,
(uint32_t)output->zero_point
pad_top, pad_bottom, pad_left, pad_right,
(uint32_t)input_zp, /* input zero point for padFillValue */
};
/* Compute fixed-point multiplier and shift for requantization.
* Use TFLite's QuantizeMultiplier approach: shift clamped to [0, 31],
* multiplier adjusted to fit.
subgraph->conv_params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
sizeof(conv_params));
pipe_buffer_write(pcontext, subgraph->conv_params_rsrc, 0,
sizeof(conv_params), conv_params);
DBG("Conv params: in=%ux%ux%u k=%ux%u out=%ux%ux%u stride=%u,%u pad=%u/%u/%u/%u\n",
in_h, in_w, in_c, k_h, k_w, out_h, out_w, out_c,
stride_h, stride_w, pad_top, pad_bottom, pad_left, pad_right);
DBG("CHW layout: inChOffset=%u pitchC=%u input_chw_size=%zu output_chw_size=%zu\n",
inChOffset, pitchC, input_chw_size, output_chw_size);
/*
* Build main params buffer: 6 IOVAs that args[0..5] will point to.
*/
double effective_scale = (double)(input->scale * weights->scale) / (double)output->scale;
int exponent;
double significand = frexp(effective_scale, &exponent); /* significand in [0.5, 1.0) */
/* Compute target shift: 31 - exponent */
int target_shift = 31 - exponent;
uint32_t shift;
int32_t multiplier;
if (target_shift < 0) {
/* Scale >= 1.0, use shift=0 and scale down multiplier */
shift = 0;
multiplier = (int32_t)round(effective_scale * (1LL << 31));
} else if (target_shift > 31) {
/* Scale very small, clamp shift to 31 and scale down multiplier */
shift = 31;
/* multiplier = significand * 2^31 * 2^(31 - target_shift) */
double scaled_sig = significand * exp2(31 - target_shift);
multiplier = (int32_t)round(scaled_sig * (1LL << 31));
} else {
shift = target_shift;
multiplier = (int32_t)round(significand * (1LL << 31));
}
DBG("Quantization: effective_scale=%f, multiplier=%d (0x%x), shift=%u\n",
effective_scale, multiplier, multiplier, shift);
DBG("Input scale=%f, weight scale=%f, output scale=%f\n",
input->scale, weights->scale, output->scale);
DBG("Conv params being sent: input_zp=%u, weight_zp=%u, output_zp=%u\n",
(uint32_t)input->zero_point, (uint32_t)weights->zero_point, (uint32_t)output->zero_point);
DBG("Multiplier=%d (0x%x), shift=%u, output_zp=%u\n", multiplier, multiplier, shift, (uint32_t)output->zero_point);
/* Create buffer for quantization params (multiplier, shift) */
int32_t quant_params[2] = {
multiplier,
(int32_t)shift
uint64_t main_params[6] = {
thames_resource(subgraph->input_rsrc)->iova,
thames_resource(subgraph->coefs_rsrc)->iova,
subgraph->bias_rsrc ? thames_resource(subgraph->bias_rsrc)->iova : 0,
thames_resource(subgraph->output_rsrc)->iova,
thames_resource(subgraph->conv_params_rsrc)->iova,
thames_resource(subgraph->quant_rsrc)->iova,
};
struct pipe_resource *conv_params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(conv_params));
pipe_buffer_write(pcontext, conv_params_rsrc, 0, sizeof(conv_params), conv_params);
main_params[4] = thames_resource(conv_params_rsrc)->iova;
struct pipe_resource *quant_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(quant_params));
pipe_buffer_write(pcontext, quant_rsrc, 0, sizeof(quant_params), quant_params);
main_params[5] = thames_resource(quant_rsrc)->iova;
subgraph->params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, sizeof(main_params));
pipe_buffer_write(pcontext, subgraph->params_rsrc, 0, sizeof(main_params), main_params);
pipe_resource_reference(&conv_params_rsrc, NULL);
pipe_resource_reference(&quant_rsrc, NULL);
subgraph->params_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT,
sizeof(main_params));
pipe_buffer_write(pcontext, subgraph->params_rsrc, 0,
sizeof(main_params), main_params);
return &subgraph->base;
}
@ -441,21 +595,50 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
struct timespec start, end;
int ret;
unsigned in_h = subgraph->conv_in_h;
unsigned in_w = subgraph->conv_in_w;
unsigned in_c = subgraph->conv_in_c;
unsigned inChOffset = subgraph->conv_inChOffset;
/*
* Transpose input from NHWC to CHW for MMALib.
*
* Source (NHWC): [H][W][C] contiguous, size = H * W * C
* Destination (CHW): [C][inChOffset] where inChOffset = ALIGN(H*W, 64)
*
* Each channel plane has H*W valid pixels followed by padding zeros.
*/
size_t input_chw_size = (size_t)in_c * inChOffset;
uint8_t *chw_buf = calloc(1, input_chw_size); /* calloc zeros the padding */
for (unsigned i = 0; i < inputs_count; i++) {
struct thames_tensor *input = thames_find_tensor(subgraph, input_idxs[i]);
assert(input);
const uint8_t *nhwc = (const uint8_t *)inputs[i];
for (unsigned c = 0; c < in_c; c++) {
for (unsigned h = 0; h < in_h; h++) {
for (unsigned w = 0; w < in_w; w++) {
unsigned nhwc_idx = (h * in_w + w) * in_c + c;
unsigned chw_idx = c * inChOffset + h * in_w + w;
chw_buf[chw_idx] = nhwc[nhwc_idx];
}
}
}
struct thames_tensor *input_tensor = thames_find_tensor(subgraph, input_idxs[i]);
assert(input_tensor);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS))
thames_dump_buffer(inputs[i], "input", 0, 0, 0, input->size);
thames_dump_buffer(inputs[i], "input-nhwc", 0, 0, 0, in_h * in_w * in_c);
pipe_buffer_write(pcontext, subgraph->input_rsrc, input->offset, input->size, inputs[i]);
pipe_buffer_write(pcontext, subgraph->input_rsrc, 0, input_chw_size, chw_buf);
}
free(chw_buf);
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->input_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->input_rsrc));
thames_dump_buffer(buf, "input-chw", 0, 0, 0, input_chw_size);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
@@ -465,13 +648,20 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
job.params = thames_resource(subgraph->params_rsrc)->handle;
job.params_size = pipe_buffer_size(subgraph->params_rsrc);
/* Pass both input buffers (input_a and input_b) as input BOs */
uint32_t in_bo_handles[2] = {
thames_resource(subgraph->input_rsrc)->handle,
thames_resource(subgraph->coefs_rsrc)->handle,
};
/*
* Pass all input BOs the kernel will access.
* The DRM driver needs to know about them for cache management.
*/
uint32_t in_bo_handles[4];
unsigned in_bo_count = 0;
in_bo_handles[in_bo_count++] = thames_resource(subgraph->input_rsrc)->handle;
in_bo_handles[in_bo_count++] = thames_resource(subgraph->coefs_rsrc)->handle;
if (subgraph->bias_rsrc)
in_bo_handles[in_bo_count++] = thames_resource(subgraph->bias_rsrc)->handle;
in_bo_handles[in_bo_count++] = thames_resource(subgraph->quant_rsrc)->handle;
job.in_bo_handles = (uintptr_t)in_bo_handles;
job.in_bo_handle_count = 2;
job.in_bo_handle_count = in_bo_count;
job.out_bo_handles = (uintptr_t)&thames_resource(subgraph->output_rsrc)->handle;
job.out_bo_handle_count = 1;
@@ -499,11 +689,12 @@ thames_ml_subgraph_invoke(struct pipe_context *pcontext,
DBG("Input hex: %s\n", hexbuf);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
uint8_t *coefsbuf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, PIPE_MAP_READ, &transfer_in);
struct pipe_transfer *transfer_coefs;
uint8_t *coefsbuf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, PIPE_MAP_READ, &transfer_coefs);
for (int i = 0; i < 32; i++)
snprintf(hexbuf + i * 3, 4, "%02x ", coefsbuf[i]);
DBG("Coefs hex: %s\n", hexbuf);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
pipe_buffer_unmap(subgraph->base.context, transfer_coefs);
/* Read output buffer */
struct pipe_transfer *transfer_out;
@@ -531,21 +722,44 @@ thames_ml_subgraph_read_outputs(struct pipe_context *pcontext,
struct thames_subgraph *subgraph = (struct thames_subgraph *)(psubgraph);
uint8_t **outputs = (uint8_t **)outputsv;
for (int i = 0; i < outputs_count; i++) {
struct thames_tensor *output = thames_find_tensor(subgraph, output_idxs[i]);
unsigned out_h = subgraph->conv_out_h;
unsigned out_w = subgraph->conv_out_w;
unsigned out_c = subgraph->conv_out_c;
unsigned pitchC = subgraph->conv_pitchC;
size_t output_chw_size = (size_t)out_c * pitchC;
for (int i = 0; i < outputs_count; i++) {
if (DBG_ENABLED(THAMES_DBG_DUMP_BOS)) {
struct pipe_transfer *transfer_in;
uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->output_rsrc,
PIPE_MAP_READ, &transfer_in);
thames_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->output_rsrc));
thames_dump_buffer(buf, "output-chw", 0, 0, 0, output_chw_size);
pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
/* For test kernel phase: output is at offset 0, size is the buffer size */
unsigned size = pipe_buffer_size(subgraph->output_rsrc);
DBG("Reading output %u from offset 0, size %u\n", output_idxs[i], size);
pipe_buffer_read(pcontext, subgraph->output_rsrc, 0, size, outputs[i]);
/*
* Read CHW output from DSP and transpose to NHWC for TFLite.
*
* Source (CHW): [out_c][pitchC] where each channel has out_h*out_w valid pixels
* Destination (NHWC): [out_h][out_w][out_c] contiguous
*/
uint8_t *chw_out = malloc(output_chw_size);
pipe_buffer_read(pcontext, subgraph->output_rsrc, 0, output_chw_size, chw_out);
for (unsigned c = 0; c < out_c; c++) {
for (unsigned h = 0; h < out_h; h++) {
for (unsigned w = 0; w < out_w; w++) {
unsigned chw_idx = c * pitchC + h * out_w + w;
unsigned nhwc_idx = (h * out_w + w) * out_c + c;
outputs[i][nhwc_idx] = chw_out[chw_idx];
}
}
}
free(chw_out);
DBG("Reading output %u: CHW %ux%u -> NHWC %ux%ux%u (%u bytes)\n",
output_idxs[i], out_c, pitchC, out_h, out_w, out_c, out_h * out_w * out_c);
}
}
@@ -561,6 +775,8 @@ thames_ml_subgraph_destroy(struct pipe_context *pcontext,
pipe_resource_reference(&subgraph->bias_rsrc, NULL);
pipe_resource_reference(&subgraph->kernel_rsrc, NULL);
pipe_resource_reference(&subgraph->params_rsrc, NULL);
pipe_resource_reference(&subgraph->quant_rsrc, NULL);
pipe_resource_reference(&subgraph->conv_params_rsrc, NULL);
util_dynarray_fini(&subgraph->operations);
util_dynarray_fini(&subgraph->tensors);

View file

@@ -186,8 +186,18 @@ struct thames_subgraph {
uint8_t *coefs;
struct pipe_resource *coefs_rsrc;
unsigned coefs_used;
struct pipe_resource *bias_rsrc; /* Bias tensor (int32) */
struct pipe_resource *quant_rsrc; /* Scale + shift (uint8 packed) */
struct pipe_resource *conv_params_rsrc; /* Convolution parameters */
/* Convolution dimensions (needed for NHWC<->CHW transpose at invoke/read) */
unsigned conv_in_h, conv_in_w, conv_in_c;
unsigned conv_out_h, conv_out_w, conv_out_c;
unsigned conv_inChOffset; /* ALIGN(in_h*in_w + 64, 64) */
unsigned conv_pitchC; /* ALIGN(out_h*out_w + 64, 64) */
int conv_input_zp; /* Input zero point (for padFillValue) */
int conv_output_zp; /* Output zero point (post-add) */
};
bool