From 2fe1301e5e8b631d8dc2567afa9c88688ec258d8 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Fri, 10 Apr 2026 15:19:10 -0500
Subject: [PATCH] ethosu: Add LeakyRelu operation

Add support for LeakyRelu operations. These are implemented as a pooling
LUT.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39975>
---
 src/gallium/drivers/ethosu/ethosu_lower.c | 115 ++++++++++++++++++++++
 src/gallium/drivers/ethosu/ethosu_ml.c    |   1 +
 2 files changed, 116 insertions(+)

diff --git a/src/gallium/drivers/ethosu/ethosu_lower.c b/src/gallium/drivers/ethosu/ethosu_lower.c
index 0e185b5d652..20b40cdd7c7 100644
--- a/src/gallium/drivers/ethosu/ethosu_lower.c
+++ b/src/gallium/drivers/ethosu/ethosu_lower.c
@@ -415,6 +415,85 @@ ethos_create_hswish_lut(struct ethosu_operation *operation, uint8_t *lut)
    }
 }
 
+static int32_t /* returns (a * b) / 2^31, rounded to nearest and saturated to INT32_MAX */
+saturating_rounding_doubling_high_mul_32(int32_t a, int32_t b)
+{
+   bool overflow = a == b && a == INT32_MIN; /* INT32_MIN * INT32_MIN / 2^31 == 2^31, the only result that does not fit */
+   int64_t a_64 = a;
+   int64_t b_64 = b;
+   int64_t ab_64 = a_64 * b_64; /* exact product, computed in 64 bits */
+   int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); /* about half of 2^31, signed like the product, so the truncating divide below rounds to nearest */
+   int32_t ab_x2_high32 = ((ab_64 + nudge) / (1ll << 31));
+   return overflow ? INT32_MAX : ab_x2_high32;
+}
+
+static int32_t /* returns x / 2^exponent, rounded to nearest with ties away from zero */
+rounding_divide_by_pow2_32(int32_t x, int exponent) /* requires 0 <= exponent <= 31 */
+{
+   const int32_t mask = (int32_t)((1ll << exponent) - 1); /* 64-bit shift: (1 << 31) would overflow int when exponent == 31 */
+   const int32_t remainder = x & mask; /* the low bits that the shift below discards */
+   const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0); /* negative x needs a larger remainder before rounding up, which sends ties away from zero */
+   return (x >> exponent) + ((remainder > threshold) ? 1 : 0); /* relies on >> of a negative value being an arithmetic shift */
+}
+
+// Returns x * 2^shift * scale / 2^31 with rounding, i.e. x times the QuantizedScale (scale, shift).
+static int
+multiply_by_quantized_multiplier(int x, int shift, int32_t scale)
+{
+   // A positive shift scales x up before the multiply; a negative shift is applied
+   // afterwards as a rounding right shift. At most one of the two is non-zero.
+   const int leftShift = shift > 0 ? shift : 0;
+   const int rightShift = shift < 0 ? -shift : 0;
+   const int32_t mul = saturating_rounding_doubling_high_mul_32(x * (1 << leftShift), scale); // x * (1 << leftShift) is not checked for overflow
+   return rounding_divide_by_pow2_32(mul, rightShift);
+}
+
+static float /* limits d to [-FLT_MAX, FLT_MAX] with CLAMP, then converts it to float */
+clamp(double d)
+{
+   return (float)CLAMP(d, -FLT_MAX, FLT_MAX);
+}
+
+/* Calculate the elementwise Mul OFM QuantizedScale: quantizes (inputScale * input2Scale) / outputScale */
+static int32_t
+elementwise_mul_scale(double inputScale, double input2Scale, double outputScale, int32_t *mul_shift)
+{
+   // reduce the inputs to single precision before combining them
+   float ifm1Scale = clamp(inputScale);
+   float ifm2Scale = clamp(input2Scale);
+   float outScale = clamp(outputScale);
+
+   float outputRescale = (ifm1Scale * ifm2Scale) / outScale; /* outputScale == 0 is not guarded against */
+   return ethosu_quantize_scale(outputRescale, mul_shift, false); /* NOTE(review): presumably returns the multiplier and stores its shift in *mul_shift - confirm */
+}
+
+static void /* fills lut with the quantized LeakyRelu result for every 8-bit input value */
+ethos_create_leakyrelu_lut(struct ethosu_operation *operation, uint8_t *lut, float alpha)
+{
+   const double ifm_scale = operation->ifm.scale;
+   const double ofm_scale = operation->ofm.scale;
+   const int zpIn = operation->ifm.zero_point;
+   const int zpOut = operation->ofm.zero_point;
+   const int qMin = operation->ifm.is_signed ? -128 : 0; /* 8-bit range, chosen from the IFM signedness */
+   const int qMax = operation->ifm.is_signed ? 127 : 255;
+   int64_t scalar = 1; /* never modified, so the multiplication by it below has no effect */
+   int32_t identity_shift;
+   int32_t identity_scale = elementwise_mul_scale(ifm_scale, 1.0, ofm_scale, &identity_shift); /* IFM-to-OFM rescale, used for x >= zpIn */
+   int32_t alpha_shift;
+   int32_t alpha_scale = elementwise_mul_scale(ifm_scale, alpha, ofm_scale, &alpha_shift); /* same rescale with alpha folded in, used for x < zpIn */
+
+   for (int x = qMin; x <= qMax; ++x, lut++) { /* writes qMax - qMin + 1 == 256 entries, in ascending x order */
+      int lutResult;
+      if (x < zpIn) /* input below the zero point: apply the alpha slope */
+         lutResult = zpOut + multiply_by_quantized_multiplier((int)(scalar * (x - zpIn)), 31 - alpha_shift, alpha_scale);
+      else
+         lutResult = zpOut + multiply_by_quantized_multiplier((int)(x - zpIn), 31 - identity_shift, identity_scale);
+
+      lutResult = MIN2(qMax, MAX2(qMin, lutResult)); /* saturate to the same 8-bit range as the input */
+      *lut = lutResult; /* stored as uint8_t, so negative results are reduced modulo 256 */
+   }
+}
+
 static void
 ethosu_lower_lut_dma(struct ethosu_subgraph *subgraph,
                      const struct pipe_ml_operation *poperation,
@@ -478,6 +557,31 @@ ethosu_lower_hswish(struct ethosu_subgraph *subgraph,
    ethosu_sched_operation(subgraph, operation);
 }
 
+static void /* lowers a LeakyRelu to an average-pooling operation with a LUT activation */
+ethosu_lower_leakyrelu(struct ethosu_subgraph *subgraph,
+                       const struct pipe_ml_operation *poperation,
+                       struct ethosu_operation *operation)
+{
+   uint8_t lut[LUT8_SIZE]; /* NOTE(review): ethos_create_leakyrelu_lut() writes 256 entries - confirm LUT8_SIZE >= 256 */
+
+   operation->type = ETHOSU_OPERATION_TYPE_POOLING;
+   operation->round_mode = ETHOSU_ROUNDING_NATURAL;
+   operation->pooling.type = ETHOSU_POOLING_TYPE_AVG;
+   operation->pooling.activation = ETHOSU_POOLING_ACTIVATION_LUT(0);
+
+   set_feature_maps(subgraph, poperation->input_tensors[0], poperation->output_tensors[0], operation);
+
+   ethos_create_leakyrelu_lut(operation, lut, poperation->leakyrelu.alpha); /* reads the IFM/OFM scale and zero point before the OFM values are overwritten below */
+   fill_lut(subgraph, operation, lut);
+
+   /* The LUT handles the zero point and scale, so make them equal */
+   operation->ofm.zero_point = operation->ifm.zero_point;
+   operation->ofm.scale = operation->ifm.scale;
+
+   allocate_feature_maps(subgraph, operation);
+   ethosu_sched_operation(subgraph, operation);
+}
+
 static void
 ethosu_lower_concatenation(struct ethosu_subgraph *subgraph,
                            const struct pipe_ml_operation *poperation,
@@ -799,6 +903,17 @@ ethosu_lower_graph(struct ethosu_subgraph *subgraph,
          break;
       }
 
+      case PIPE_ML_OPERATION_TYPE_LEAKY_RELU: {
+         ethosu_lower_leakyrelu(subgraph, &poperations[i], &operation);
+
+         struct ethosu_operation dma_operation = {0};
+         ethosu_lower_lut_dma(subgraph, &poperations[i], &operation, &dma_operation);
+         util_dynarray_append(&subgraph->operations, dma_operation);
+
+         util_dynarray_append(&subgraph->operations, operation);
+         break;
+      }
+
       default:
          DBG("poperation->type %d\n", poperations[i].type);
          UNREACHABLE("Unsupported ML operation type");
diff --git a/src/gallium/drivers/ethosu/ethosu_ml.c b/src/gallium/drivers/ethosu/ethosu_ml.c
index a2fbed926bb..c5721dd21c5 100644
--- a/src/gallium/drivers/ethosu/ethosu_ml.c
+++ b/src/gallium/drivers/ethosu/ethosu_ml.c
@@ -148,6 +148,7 @@ ethosu_ml_operation_supported(struct pipe_ml_device *pdevice,
    case PIPE_ML_OPERATION_TYPE_LOGISTIC:
    case PIPE_ML_OPERATION_TYPE_TANH:
    case PIPE_ML_OPERATION_TYPE_HSWISH:
+   case PIPE_ML_OPERATION_TYPE_LEAKY_RELU:
       supported = true;
       break;
    case PIPE_ML_OPERATION_TYPE_RESIZE: {