ethosu: Add support for per-channel quantization

Add support for models whose coefficient tensors carry different
quantization parameters for each output channel.

The NPU can handle per-channel scales as can be seen in
fill_scale_and_biases(), which already iterates per output channel.

Activation tensors (input/output) don't have per-channel quantization.

- Add scales/zero_points arrays to ethosu_kernel struct
- Copy per-channel scales from weight tensor in lower pass
- Use per-channel scale when computing conv_scale in coefs
- Allow per-channel quantization in operation_supported check

Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39594>
This commit is contained in:
Anders Roxell 2026-01-15 15:50:18 +01:00 committed by Marge Bot
parent 0887e6d89f
commit 63c028b5e0
4 changed files with 33 additions and 24 deletions

View file

@@ -22,7 +22,9 @@ fill_scale_and_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation
for (unsigned i = 0; i < operation->ofm.shape.depth; i++) {
uint64_t bias = biases[i];
double conv_scale = ((double)operation->ifm.scale * (double)operation->kernel.scale) / (double)operation->ofm.scale;
double kernel_scale = (operation->kernel.scales != NULL) ?
operation->kernel.scales[i] : operation->kernel.scale;
double conv_scale = ((double)operation->ifm.scale * kernel_scale) / (double)operation->ofm.scale;
uint32_t shift;
int scale = ethosu_quantize_scale(conv_scale, &shift);

View file

@@ -125,8 +125,6 @@ ethosu_lower_convolution(struct ethosu_subgraph *subgraph,
operation->type = ETHOSU_OPERATION_TYPE_CONVOLUTION;
operation->conv.depthwise = is_depthwise(poperation);
// operation->padding_same = poperation->conv.padding_same;
// operation->stride = poperation->conv.stride_x;
set_feature_maps(input_tensor, poperation->output_tensors[0], operation);
@@ -141,6 +139,24 @@ ethosu_lower_convolution(struct ethosu_subgraph *subgraph,
operation->kernel.zero_point = poperation->conv.weight_tensor->zero_point;
operation->kernel.is_signed = poperation->conv.weight_tensor->is_signed;
/* Per-channel quantization support */
struct pipe_tensor *weight = poperation->conv.weight_tensor;
if (weight->scales != NULL) {
unsigned num_channels = poperation->output_tensors[0]->dims[3];
operation->kernel.scales = malloc(num_channels * sizeof(float));
memcpy(operation->kernel.scales, weight->scales, num_channels * sizeof(float));
if (weight->zero_points != NULL) {
operation->kernel.zero_points = malloc(num_channels * sizeof(int));
memcpy(operation->kernel.zero_points, weight->zero_points, num_channels * sizeof(int));
} else {
operation->kernel.zero_points = NULL;
}
} else {
operation->kernel.scales = NULL;
operation->kernel.zero_points = NULL;
}
operation->conv.part_kernel_first = ethosu_is_part_kernel_first(operation);
if (poperation->conv.padding_same) {

View file

@@ -137,16 +137,6 @@ ethosu_quantize_scale(double scale, uint32_t *shift)
return quantized_scale;
}
static bool
tensor_quantization_supported(struct pipe_tensor *tensor)
{
/*
* Per-axis quantization not supported, for details see:
* https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor
*/
return tensor->scales == NULL && tensor->zero_points == NULL;
}
bool
ethosu_ml_operation_supported(struct pipe_context *pcontext,
const struct pipe_ml_operation *operation)
@@ -155,17 +145,10 @@ ethosu_ml_operation_supported(struct pipe_context *pcontext,
switch (operation->type) {
case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
struct pipe_tensor *input_tensor = operation->input_tensors[0];
struct pipe_tensor *weight_tensor = operation->conv.weight_tensor;
struct pipe_tensor *bias_tensor = operation->conv.bias_tensor;
struct pipe_tensor *output_tensor = operation->output_tensors[0];
// Dilation and per-axis quantization not yet implemented
if (tensor_quantization_supported(input_tensor) &&
tensor_quantization_supported(weight_tensor) &&
tensor_quantization_supported(bias_tensor) &&
tensor_quantization_supported(output_tensor) &&
operation->conv.dilation_width_factor == 1 &&
/*
* Dilation is not yet implemented.
*/
if (operation->conv.dilation_width_factor == 1 &&
operation->conv.dilation_height_factor == 1)
supported = true;
@@ -356,7 +339,12 @@ ethosu_ml_subgraph_destroy(struct pipe_context *pcontext,
ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
assert(ret >= 0);
util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) {
free(operation->kernel.scales);
free(operation->kernel.zero_points);
}
util_dynarray_fini(&subgraph->operations);
util_dynarray_fini(&subgraph->tensors);
free(subgraph);

View file

@@ -82,6 +82,9 @@ struct ethosu_kernel {
bool is_signed;
unsigned zero_point;
float scale;
/* Per-channel quantization (NULL for per-tensor) */
float *scales;
int *zero_points;
};
struct ethosu_padding {