From 7c1ec5642781a37e9cda6d8de6dc2345be36ef85 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Mon, 19 Jan 2026 11:49:16 +0100
Subject: [PATCH] ethosu: clean up ADD elementwise scaling

Replace the two functions simplified_elementwise_add_sub_scale and
eltwise_emit_ofm_scaling with a single advanced_elementwise_add_sub_scale
that follows the ethos-u-vela naming. Remove the large block of
commented-out Vela Python code.

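No functional change intended. The only behavioral difference is that the
old output_scale == 0.0 special case (which forced a zero output rescale)
is removed together with the helper.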

Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39594>
---
 src/gallium/drivers/ethosu/ethosu_cmd.c | 160 ++++++------------------
 1 file changed, 36 insertions(+), 124 deletions(-)

diff --git a/src/gallium/drivers/ethosu/ethosu_cmd.c b/src/gallium/drivers/ethosu/ethosu_cmd.c
index eda454cdadc..caab9994921 100644
--- a/src/gallium/drivers/ethosu/ethosu_cmd.c
+++ b/src/gallium/drivers/ethosu/ethosu_cmd.c
@@ -381,150 +381,62 @@ emit_ifm2_broadcast(struct ethosu_subgraph *subgraph, struct ethosu_operation *o
 }
 
 /*
-def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
-        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
-        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
-        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None
-
-        if npu_op.activation is not None and npu_op.activation.op_type in (
-            NpuActivationOp.SIGMOID,
-            NpuActivationOp.TANH,
-        ):
-            output_scale = 1 / 0x3000
-
-        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
-            if npu_op.rescale:
-                ofm_scale, shift = npu_op.rescale
-            elif None in (input_scale, input2_scale, output_scale):
-                ofm_scale = 1
-                shift = 0
-            else:
-                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
-        else:  # Add/Sub
-            # Default operand scaling is no scaling
-            opa_scale = opb_scale = 1
-            opa_shift = 0
-            bitdepth = npu_op.ifm.data_type.size_in_bits()
-            use_advanced_scaling = False
-            if npu_op.rescale is not None:
-                # Explicit ofm scaling
-                ofm_scale, shift = npu_op.rescale
-            elif None in (input_scale, input2_scale, output_scale):
-                # No ofm scaling
-                ofm_scale = 1
-                shift = 0
-            elif input_scale == input2_scale and bitdepth == 16:
-                # int16 same scaling
-                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
-                    input_scale, input2_scale, output_scale
-                )
-                # align the double rounding with that of advanced scaling
-                opa_scale //= 2
-                opb_scale //= 2
-                shift -= 1
-                opa_shift = 0  # Unused for this case
-            elif input_scale == input2_scale:
-                # Same scaling
-                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
-                    input_scale, input2_scale, output_scale
-                )
-                opa_shift = 0  # Unused for this case
-                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
-                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
-                # the following we know that double rounding will have no effect for advanced scaling
-                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
-                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
-            else:
-                use_advanced_scaling = True
-            if use_advanced_scaling:
-                # Use advanced implementation only when input/output scales differ,
-                # or when we can't guarantee the absence of rounding errors
-                (
-                    opa_scale,
-                    opa_shift,
-                    ofm_scale,
-                    shift,
-                    op_to_scale,
-                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
-                opb_scale = 0  # Unused for this case
-                if npu_op.reversed_operands:
-                    # If the operand order is reversed we also have to swap which operand is scaled
-                    if op_to_scale == scaling.OperandToScale.OPa:
-                        op_to_scale = scaling.OperandToScale.OPb
-                    else:
-                        op_to_scale = scaling.OperandToScale.OPa
-            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
-            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
-*/
-
-static void
-simplified_elementwise_add_sub_scale(
+ * Elementwise ADD/SUB scaling from Vela advanced_elementwise_add_sub_scale().
+ * Scale up the operand with the smaller scale to match the larger one.
+ * OPA_SCALE carries the input scaling; OPB_SCALE is set to 0.
+ * op_to_scale in IFM_PRECISION tells the NPU which operand to scale.
+ */
+static enum ethosu_op_to_scale
+advanced_elementwise_add_sub_scale(
+   struct ethosu_subgraph *subgraph,
    double input1_scale,
    double input2_scale,
-   double output_scale,
-   uint32_t input_shift,
-   double *out_input1_rescale,
-   double *out_input2_rescale,
-   uint32_t *out_out_scale,
-   uint32_t *out_out_shift)
+   double output_scale)
 {
    double max_input_scale = MAX2(input1_scale, input2_scale);
-   double input_shift_val = (double)(1LL << input_shift); /* Use 1LL for large shifts */
-
-   *out_input1_rescale = input1_scale * input_shift_val / (2.0 * max_input_scale);
-   *out_input2_rescale = input2_scale * input_shift_val / (2.0 * max_input_scale);
-
-   /*
-    * Be careful with division by zero or very small output_scale if output_scale
-    * can be zero or close to zero.
-    */
-   double output_rescale_val;
-   if (output_scale == 0.0) {
-      /* Handle error or return specific value */
-      output_rescale_val = 0.0; /* Or INFINITY, depending on desired behavior */
-   } else {
-      output_rescale_val = (2.0 * max_input_scale) / (output_scale * input_shift_val);
-   }
-
-   *out_out_scale = ethosu_quantize_scale(output_rescale_val, out_out_shift);
-}
-
-static enum ethosu_op_to_scale
-eltwise_emit_ofm_scaling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
-{
-   double max_input_scale = MAX2(operation->ifm.scale, operation->ifm2.scale);
-   double min_input_scale = MIN2(operation->ifm.scale, operation->ifm2.scale);
+   double min_input_scale = MIN2(input1_scale, input2_scale);
    unsigned bitdepth = 8;
    uint32_t input_shift = (bitdepth == 8) ? 20 : 15;
-   double input1_rescale_tmp;
-   double input2_rescale_tmp;
-   unsigned ofm_scale, ofm_shift;
-   unsigned opa_scale, opa_shift;
+   double input_shift_val = (double)(1ULL << input_shift);
+   enum ethosu_op_to_scale op_to_scale;
+   uint32_t opa_scale, opa_shift;
+   uint32_t ofm_scale, ofm_shift;
+   double input_rescale, output_rescale;
 
-   simplified_elementwise_add_sub_scale(
-      min_input_scale, max_input_scale, operation->ofm.scale, input_shift,
-      &input1_rescale_tmp, &input2_rescale_tmp,
-      &ofm_scale, &ofm_shift);
+   /* Scale the operand with the smaller scale; equal scales pick OP_B */
+   if (input1_scale < input2_scale)
+      op_to_scale = OP_A;
+   else
+      op_to_scale = OP_B;
 
-   opa_scale = ethosu_quantize_scale(input1_rescale_tmp, &opa_shift);
+   /* From Vela simplified_elementwise_add_sub_scale:
+    * input1_rescale = input1_scale * (1 << input_shift) / (2 * max_input_scale)
+    * output_rescale = (2 * max_input_scale) / (output_scale * (1 << input_shift))
+    */
+   input_rescale = min_input_scale * input_shift_val / (2.0 * max_input_scale);
+   output_rescale = (2.0 * max_input_scale) / (output_scale * input_shift_val);
+
+   opa_scale = ethosu_quantize_scale(input_rescale, &opa_shift);
+   ofm_scale = ethosu_quantize_scale(output_rescale, &ofm_shift);
 
    EMIT1(NPU_SET_OPA_SCALE, opa_shift, opa_scale);
    EMIT1(NPU_SET_OPB_SCALE, 0x0, 0x0);
    EMIT1(NPU_SET_OFM_SCALE, ofm_shift, ofm_scale);
 
-   if (operation->ifm.scale < operation->ifm2.scale)
-      return OP_A;
-   else
-      return OP_B;
+   return op_to_scale;
 }
 
 static void
 emit_eltwise(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
 {
    bool has_scalar = false;
-   enum ethosu_op_to_scale op_to_scale = OP_NONE;
+   enum ethosu_op_to_scale op_to_scale;
 
-   op_to_scale = eltwise_emit_ofm_scaling(subgraph, operation);
+   op_to_scale = advanced_elementwise_add_sub_scale(
+      subgraph,
+      operation->ifm.scale,
+      operation->ifm2.scale,
+      operation->ofm.scale);
 
    emit_common(subgraph, operation, op_to_scale);