From 41531544232a94030583295e498f7888e302a83a Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Thu, 19 Sep 2024 16:15:36 +0200
Subject: [PATCH] etnaviv/nn: Add support for signed 8-bit tensors

The hardware only supports unsigned 8-bit tensors, but with the
configurable zero point we can map signed 8-bit integers to unsigned
8-bit integers by adding a constant offset of 128 to all values and to
the zero point setting.

This requires adding 128 to all input tensors and subtracting 128
from all output tensors during inference.

Reviewed-by: Tomeu Vizoso <tomeu@tomeuvizoso.net>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31979>
---
 src/gallium/drivers/etnaviv/etnaviv_ml.c      | 31 +++++++++++-
 src/gallium/drivers/etnaviv/etnaviv_ml.h      |  1 +
 src/gallium/drivers/etnaviv/etnaviv_ml_nn.c   | 49 +++++++++++++++----
 .../drivers/etnaviv/etnaviv_ml_nn_v8.c        | 15 +++++-
 src/gallium/drivers/etnaviv/etnaviv_ml_tp.c   | 20 ++++++--
 5 files changed, 99 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml.c b/src/gallium/drivers/etnaviv/etnaviv_ml.c
index 4934d1afa47..2b29ef28ba3 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml.c
@@ -542,7 +542,19 @@ etna_ml_subgraph_invoke(struct pipe_context *pctx, struct pipe_ml_subgraph *psub
 
    for (int i = 0; i < inputs_count; i++) {
       struct pipe_resource *res = etna_ml_get_tensor(subgraph, input_idxs[i]);
-      pipe_buffer_write(pctx, res, offsets[input_idxs[i]], sizes[input_idxs[i]], inputs[i]);
+      if (is_signed[i]) {
+         /* The hardware is unsigned-only: bias signed input by 128 while
+          * uploading, at the same tensor offset the unsigned path uses. */
+         struct pipe_transfer *dst_transfer;
+         const uint8_t *src = inputs[i];
+         uint8_t *dst_map = pipe_buffer_map_range(pctx, res, offsets[input_idxs[i]], sizes[input_idxs[i]], PIPE_MAP_WRITE, &dst_transfer);
+         assert(dst_map);
+         for (unsigned k = 0; k < sizes[input_idxs[i]]; k++)
+            dst_map[k] = src[k] + 128;
+         pipe_buffer_unmap(pctx, dst_transfer);
+      } else {
+         pipe_buffer_write(pctx, res, offsets[input_idxs[i]], sizes[input_idxs[i]], inputs[i]);
+      }
    }
 
    unsigned i = 0;
@@ -662,7 +674,22 @@ etna_ml_subgraph_read_outputs(struct pipe_context *context, struct pipe_ml_subgr
 
    for (int i = 0; i < outputs_count; i++) {
       struct pipe_resource *res = etna_ml_get_tensor(subgraph, output_idxs[i]);
-      pipe_buffer_read(context, res, 0, pipe_buffer_size(res), outputs[i]);
+      if (is_signed[i]) {
+         struct pipe_transfer *src_transfer;
+         uint8_t *src_map;
+         src_map = (uint8_t *) pipe_buffer_map_range(context,
+                                                     res,
+                                                     0, pipe_buffer_size(res),
+                                                     PIPE_MAP_READ,
+                                                     &src_transfer);
+         assert(src_map);
+         for (unsigned k = 0; k < pipe_buffer_size(res); k++) {
+            ((uint8_t *)(outputs[i]))[k] = src_map[k] - 128;
+         }
+         pipe_buffer_unmap(context, src_transfer);
+      } else {
+         pipe_buffer_read(context, res, 0, pipe_buffer_size(res), outputs[i]);
+      }
    }
 }
 
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml.h b/src/gallium/drivers/etnaviv/etnaviv_ml.h
index e5d2a18dfc0..3b05618f13d 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml.h
@@ -94,6 +94,7 @@ struct etna_operation {
    unsigned weight_height;
    uint8_t weight_zero_point;
    float weight_scale;
+   bool weight_signed;
 
    uint8_t addition_offset;
 
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
index 66149229ca4..858409f990d 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c
@@ -199,9 +199,15 @@ pointwise_to_2x2(struct etna_ml_subgraph *subgraph, struct etna_operation *opera
       uint8_t *map_out = output + channel * 2 * 2 * operation->input_channels;
 
       map_out[0] = map_in[0];
-      map_out[1] = operation->weight_zero_point;
-      map_out[2] = operation->weight_zero_point;
-      map_out[3] = operation->weight_zero_point;
+      if (operation->weight_signed) {
+         map_out[1] = operation->weight_zero_point - 128;
+         map_out[2] = operation->weight_zero_point - 128;
+         map_out[3] = operation->weight_zero_point - 128;
+      } else {
+         map_out[1] = operation->weight_zero_point;
+         map_out[2] = operation->weight_zero_point;
+         map_out[3] = operation->weight_zero_point;
+      }
    }
 
    pipe_resource_reference(&operation->weight_tensor, NULL);
@@ -231,6 +237,8 @@ expand_depthwise(struct etna_ml_subgraph *subgraph, struct etna_operation *opera
       for (unsigned i = 0; i < operation->weight_width * operation->weight_height * operation->input_channels; i++) {
          if (i % operation->input_channels == in_depth)
             map_out[i] = map_in[i];
+         else if (operation->weight_signed)
+            map_out[i] = operation->weight_zero_point - 128;
          else
             map_out[i] = operation->weight_zero_point;
       }
@@ -380,7 +388,8 @@ strided_to_normal(struct etna_ml_subgraph *subgraph, struct etna_operation *oper
    output = map_resource(output_res);
 
    unsigned wdims_out[4] = {operation->output_channels, operation->weight_width, operation->weight_height, operation->input_channels};
-   reshape(input, output, operation->stride, operation->weight_zero_point, wdims_in, wdims_out);
+   int weight_zero_point = operation->weight_signed ? (operation->weight_zero_point - 128) : operation->weight_zero_point;
+   reshape(input, output, operation->stride, weight_zero_point, wdims_in, wdims_out);
 
    pipe_resource_reference(&operation->weight_tensor, NULL);
    operation->weight_tensor = output_res;
@@ -415,6 +424,25 @@ calc_pooling_first_pixel(struct etna_ml_subgraph *subgraph,
    return false;
 }
 
+static inline uint8_t
+etna_tensor_zero_point(struct pipe_tensor *tensor)
+{
+   /*
+    * The hardware only handles unsigned 8-bit integers, so signed tensors
+    * are moved from the -128..127 range to 0..255: 128 is added when a
+    * tensor is uploaded and subtracted again when it is downloaded. The
+    * zero point returned here is shifted by the same amount; weight
+    * coefficients have to be adapted accordingly.
+    */
+   if (!tensor->is_signed) {
+      assert(tensor->zero_point >= 0 && tensor->zero_point <= 255);
+      return tensor->zero_point;
+   }
+
+   assert(tensor->zero_point >= -128 && tensor->zero_point <= 127);
+   return tensor->zero_point + 128;
+}
+
 void
 etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
                           const struct pipe_ml_operation *poperation,
@@ -442,21 +470,22 @@ etna_ml_lower_convolution(struct etna_ml_subgraph *subgraph,
    operation->input_width = poperation->input_tensors[0]->dims[1];
    operation->input_height = poperation->input_tensors[0]->dims[2];
    operation->input_channels = poperation->input_tensors[0]->dims[3];
-   operation->input_zero_point = poperation->input_tensors[0]->zero_point;
+   operation->input_zero_point = etna_tensor_zero_point(poperation->input_tensors[0]);
    operation->input_scale = poperation->input_tensors[0]->scale;
 
    operation->output_tensor = poperation->output_tensors[0]->index;
    operation->output_width = poperation->output_tensors[0]->dims[1];
    operation->output_height = poperation->output_tensors[0]->dims[2];
    operation->output_channels = poperation->output_tensors[0]->dims[3];
-   operation->output_zero_point = poperation->output_tensors[0]->zero_point;
+   operation->output_zero_point = etna_tensor_zero_point(poperation->output_tensors[0]);
    operation->output_scale = poperation->output_tensors[0]->scale;
 
    pipe_resource_reference(&operation->weight_tensor, poperation->conv.weight_tensor->resource);
    operation->weight_width = poperation->conv.weight_tensor->dims[1];
    operation->weight_height = poperation->conv.weight_tensor->dims[2];
-   operation->weight_zero_point = poperation->conv.weight_tensor->zero_point;
+   operation->weight_zero_point = etna_tensor_zero_point(poperation->conv.weight_tensor);
    operation->weight_scale = poperation->conv.weight_tensor->scale;
+   operation->weight_signed = poperation->conv.weight_tensor->is_signed;
 
    pipe_resource_reference(&operation->bias_tensor, poperation->conv.bias_tensor->resource);
 
@@ -544,7 +573,7 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
    operation->input_width = poperation->input_tensors[0]->dims[1];
    operation->input_height = poperation->input_tensors[0]->dims[2];
    operation->input_channels = poperation->input_tensors[0]->dims[3];
-   operation->input_zero_point = poperation->input_tensors[0]->zero_point;
+   operation->input_zero_point = etna_tensor_zero_point(poperation->input_tensors[0]);
    operation->input_scale = poperation->input_tensors[0]->scale;
    operation->input_tensor_size = operation->input_width *
                                   operation->input_height *
@@ -555,7 +584,7 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
    operation->output_width = poperation->output_tensors[0]->dims[1];
    operation->output_height = poperation->output_tensors[0]->dims[2];
    operation->output_channels = poperation->output_tensors[0]->dims[3];
-   operation->output_zero_point = poperation->output_tensors[0]->zero_point;
+   operation->output_zero_point = etna_tensor_zero_point(poperation->output_tensors[0]);
    operation->output_scale = poperation->output_tensors[0]->scale;
 
    if (nn_core_version < 8) {
@@ -564,6 +593,7 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
       operation->weight_height = 2;
       operation->weight_zero_point = 0x0;
       operation->weight_scale = compute_weight_scale_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale);
+      operation->weight_signed = false;
       operation->addition_offset = compute_addition_offset(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);
 
       uint8_t *weight_map = map_resource(operation->weight_tensor);
@@ -582,6 +612,7 @@ etna_ml_lower_add(struct etna_ml_subgraph *subgraph,
       operation->weight_height = 1;
       operation->weight_zero_point = 0x0;
       operation->weight_scale = compute_weight_scale_add(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale);
+      operation->weight_signed = false;
       operation->addition_offset = compute_addition_offset(poperation->input_tensors[1]->scale, poperation->input_tensors[0]->scale, operation->weight_scale);
 
       uint8_t (*weight_map)[operation->input_channels] = map_resource(operation->weight_tensor);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
index 6840301285c..f486f7994c7 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn_v8.c
@@ -192,8 +192,17 @@ static uint32_t calculate_bias_correction(struct etna_ml_subgraph *subgraph, con
    else
       input_channels = operation->input_channels;
 
-   for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
-      correction += (weights[i] - operation->weight_zero_point) * input_zero_point;
+   if (operation->weight_signed) {
+      /* See etna_tensor_zero_point() */
+      int8_t weight_zero_point = operation->weight_zero_point - 128;
+
+      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
+         correction += (((int8_t *)weights)[i] - weight_zero_point) * input_zero_point;
+      }
+   } else {
+      for (unsigned i = 0; i < operation->weight_width * operation->weight_height * input_channels; i++) {
+         correction += (weights[i] - operation->weight_zero_point) * input_zero_point;
+      }
    }
 
    return correction;
@@ -652,6 +661,8 @@ static void encode_superblock(struct etna_ml_subgraph *subgraph, const struct et
 
             if (kernel_idx + block * block_size >= kernel_size)
                weight = operation->weight_zero_point;
+            else if (operation->weight_signed)
+               weight = ((int8_t *)(weights_map[oc]))[kernel_idx + block * block_size] + 128;
             else
                weight = weights_map[oc][kernel_idx + block * block_size];
 
diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c
index c1a1d63c7d6..3e83105c91d 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_ml_tp.c
@@ -535,6 +535,18 @@ create_reshuffle_config(struct etna_ml_subgraph *subgraph, const struct etna_ope
    return bo;
 }
 
+static inline uint8_t
+etna_tensor_zero_point(struct pipe_tensor *tensor)
+{
+   /* Signed zero points are biased by 128 for the unsigned-only hardware. */
+   if (!tensor->is_signed) {
+      assert(tensor->zero_point >= 0 && tensor->zero_point <= 255);
+      return tensor->zero_point;
+   }
+   assert(tensor->zero_point >= -128 && tensor->zero_point <= 127);
+   return tensor->zero_point + 128;
+}
+
 void
 etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
                         const struct pipe_ml_operation *first_operation,
@@ -548,7 +560,7 @@ etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
    operation->input_width = first_operation->input_tensors[0]->dims[1];
    operation->input_height = first_operation->input_tensors[0]->dims[2];
    operation->input_channels = first_operation->input_tensors[0]->dims[3];
-   operation->input_zero_point = first_operation->input_tensors[0]->zero_point;
+   operation->input_zero_point = etna_tensor_zero_point(first_operation->input_tensors[0]);
    operation->input_scale = first_operation->input_tensors[0]->scale;
    operation->input_tensor_size = operation->input_width *
                                   operation->input_height *
@@ -559,7 +571,7 @@ etna_ml_lower_transpose(struct etna_ml_subgraph *subgraph,
    operation->output_width = first_operation->input_tensors[0]->dims[1];
    operation->output_height = first_operation->input_tensors[0]->dims[2];
    operation->output_channels = first_operation->input_tensors[0]->dims[3];
-   operation->output_zero_point = first_operation->input_tensors[0]->zero_point;
+   operation->output_zero_point = etna_tensor_zero_point(first_operation->input_tensors[0]);
    operation->output_scale = first_operation->input_tensors[0]->scale;
 }
 
@@ -606,7 +618,7 @@ etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
    operation->input_width = convolution->input_tensors[0]->dims[1];
    operation->input_height = convolution->input_tensors[0]->dims[2];
    operation->input_channels = convolution->input_tensors[0]->dims[3];
-   operation->input_zero_point = convolution->input_tensors[0]->zero_point;
+   operation->input_zero_point = etna_tensor_zero_point(convolution->input_tensors[0]);
    operation->input_scale = convolution->input_tensors[0]->scale;
    operation->input_tensor_size = operation->input_width *
                                   operation->input_height *
@@ -617,7 +629,7 @@ etna_ml_lower_reshuffle(struct etna_ml_subgraph *subgraph,
    operation->output_width = DIV_ROUND_UP(operation->input_width, operation->stride);
    operation->output_height = DIV_ROUND_UP(operation->input_height, operation->stride);
    operation->output_channels = operation->input_channels * operation->stride * operation->stride;
-   operation->output_zero_point = convolution->input_tensors[0]->zero_point;
+   operation->output_zero_point = etna_tensor_zero_point(convolution->input_tensors[0]);
    operation->output_scale = convolution->input_tensors[0]->scale;
 
    /* When destriding a convolution, the transformation to be made to the input