From c200b18e876468b51fe80d9660f612dc03a5138e Mon Sep 17 00:00:00 2001
From: Aleksi Sapon
Date: Tue, 4 Nov 2025 15:47:51 -0500
Subject: [PATCH] llvmpipe: use half-even rounding in lerp

This fixes a bias in texture linear sampling, which can be very
noticeable when strong lighting is applied to mipmapped textures
generated at runtime by successive linear blitting. The extra
precision isn't actually needed for the lerp, and the intrinsic's
rounding is wrong, so it is removed in favour of correct, uniform
codegen.

Reviewed-by: Jose Fonseca
Reviewed-by: Roland Scheidegger
Part-of:
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 109 ++++++++++----
 1 file changed, 53 insertions(+), 56 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index e84d9361ada..b5aea1ab4ed 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1171,84 +1171,81 @@ lp_build_lerp_simple(struct lp_build_context *bld,
       return lp_build_mad(bld, x, delta, v0);
    }

-   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
-      if (!bld->type.sign) {
-         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
-            /*
-             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
-             * most-significant-bit to the lowest-significant-bit, so that
-             * later we can just divide by 2**n instead of 2**n - 1.
-             */
-
-            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
-         }
-
-         /* (x * delta) >> n */
-         /*
-          * For this multiply, higher internal precision is required to pass
-          * CTS, the most efficient path to that is pmulhrsw on ssse3 and
-          * above. This could be opencoded on other arches if conformance was
-          * required.
-          */
-         if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else {
-            res = lp_build_mul(bld, x, delta);
-            res = lp_build_shr_imm(bld, res, half_width);
-         }
-      } else {
-         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
-      }
-   } else {
-      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-      res = lp_build_mul(bld, x, delta);
-   }
-
-   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+      if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
+         /*
+          * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
+          * most-significant-bit to the lowest-significant-bit, so that
+          * later we can just divide by 2**n instead of 2**n - 1.
+          */
+         x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
+      }
+
+      /*
+       * To have correct rounding, we must implement (example for 8 bits):
+       * uint16_t lerp_round_half_even(uint16_t x, uint16_t v0, uint16_t v1)
+       * {
+       *    uint16_t delta = v1 - v0;
+       *    uint16_t m = x * delta;
+       *    uint16_t is_odd = (m & 0x100) >> 8;
+       *    m += 0x7F + is_odd; // + 0.5 for odd, + ~0.498 for even
+       *    m >>= 8;
+       *    return (uint8_t)v0 + (uint8_t)m;
+       * }
+       */
+      res = lp_build_mul(bld, x, delta);
+      LLVMValueRef is_odd = lp_build_shr_imm(bld, lp_build_and(bld, res,
+         lp_build_const_int_vec(bld->gallivm, bld->type, 1ll << half_width)), half_width);
+      res = lp_build_add(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << (half_width - 1)) - 1));
+      res = lp_build_add(bld, res, is_odd);
+      res = lp_build_shr_imm(bld, res, half_width);
+
       /*
        * At this point both res and v0 only use the lower half of the bits,
        * the rest is zero. Instead of add / mask, do add with half wide type.
        */
       struct lp_type narrow_type;
-      struct lp_build_context narrow_bld;
-
       memset(&narrow_type, 0, sizeof narrow_type);
       narrow_type.sign = bld->type.sign;
       narrow_type.width = bld->type.width/2;
       narrow_type.length = bld->type.length*2;
-
+      struct lp_build_context narrow_bld;
       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
+
       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
       res = lp_build_add(&narrow_bld, v0, res);
       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
-   } else {
-      res = lp_build_add(bld, v0, res);
+      return res;
+   }

-      if (bld->type.fixed) {
-         /*
-          * We need to mask out the high order bits when lerping 8bit
-          * normalized colors stored on 16bits
-          *
-          * XXX: This step is necessary for lerping 8bit colors stored on
-          * 16bits, but it will be wrong for true fixed point use cases.
-          * Basically we need a more powerful lp_type, capable of further
-          * distinguishing the values interpretation from the value storage.
-          */
-         LLVMValueRef low_bits;
-         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
-         res = LLVMBuildAnd(builder, res, low_bits, "");
-      }
+   assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
+   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
+      /*
+       * The rescaling trick above doesn't work for signed numbers, so
+       * use the 2**n - 1 divison approximation in lp_build_mul_norm
+       * instead.
+       */
+      res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
+   } else {
+      res = lp_build_mul(bld, x, delta);
+   }
+
+   res = lp_build_add(bld, v0, res);
+
+   if (bld->type.fixed) {
+      /*
+       * We need to mask out the high order bits when lerping 8bit
+       * normalized colors stored on 16bits
+       *
+       * XXX: This step is necessary for lerping 8bit colors stored on
+       * 16bits, but it will be wrong for true fixed point use cases.
+       * Basically we need a more powerful lp_type, capable of further
+       * distinguishing the values interpretation from the value storage.
+       */
+      LLVMValueRef low_bits;
+      low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << half_width) - 1);
+      res = LLVMBuildAnd(builder, res, low_bits, "");
    }

    return res;
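
Note (not part of the patch): the standalone C sketch below models the old
truncating fallback and the new round-half-even path for the 8-bit-in-16-bit
case, and brute-forces the bias the commit message describes. The helper names
lerp_trunc and lerp_rhe and the test harness are illustrative assumptions, not
llvmpipe code; the weight rescaling and the "0x7F + is_odd" adjustment mirror
the lerp_round_half_even comment in the diff above.

#include <stdint.h>
#include <stdio.h>

/* Old fallback behaviour: (w * delta) >> 8 in 16-bit lanes, always rounding down. */
static uint8_t lerp_trunc(uint8_t x, uint8_t v0, uint8_t v1)
{
   uint16_t w = (uint16_t)(x + (x >> 7));   /* rescale weight from [0, 255] to [0, 256] */
   uint16_t delta = (uint16_t)(v1 - v0);    /* wraps mod 2**16, like the 16-bit lanes */
   uint16_t m = (uint16_t)(w * delta);
   m >>= 8;
   return (uint8_t)(v0 + m);                /* final add narrows back to 8 bits */
}

/* New behaviour: add 0x7F plus the would-be LSB before shifting. */
static uint8_t lerp_rhe(uint8_t x, uint8_t v0, uint8_t v1)
{
   uint16_t w = (uint16_t)(x + (x >> 7));
   uint16_t delta = (uint16_t)(v1 - v0);
   uint16_t m = (uint16_t)(w * delta);
   uint16_t is_odd = (m >> 8) & 1;
   m = (uint16_t)(m + 0x7F + is_odd);       /* + 0.5 for odd, + ~0.498 for even */
   m >>= 8;
   return (uint8_t)(v0 + m);
}

int main(void)
{
   double bias_trunc = 0.0, bias_rhe = 0.0, max_trunc = 0.0, max_rhe = 0.0;
   long n = 0;

   /* Exhaustively compare both schemes against the exact real-valued lerp. */
   for (int x = 0; x < 256; x++) {
      int w = x + (x >> 7);
      for (int v0 = 0; v0 < 256; v0++) {
         for (int v1 = 0; v1 < 256; v1++) {
            double exact = v0 + w * (v1 - v0) / 256.0;
            double et = lerp_trunc((uint8_t)x, (uint8_t)v0, (uint8_t)v1) - exact;
            double er = lerp_rhe((uint8_t)x, (uint8_t)v0, (uint8_t)v1) - exact;
            bias_trunc += et;
            bias_rhe += er;
            if (et < 0) et = -et;
            if (er < 0) er = -er;
            if (et > max_trunc) max_trunc = et;
            if (er > max_rhe) max_rhe = er;
            n++;
         }
      }
   }

   printf("truncating lerp:      mean error %+.4f, max abs error %.4f\n",
          bias_trunc / n, max_trunc);
   printf("round-half-even lerp: mean error %+.4f, max abs error %.4f\n",
          bias_rhe / n, max_rhe);
   return 0;
}

Built with any C99 compiler, the loop should report a mean error of roughly
-0.5 for the truncating variant and roughly 0 for the round-half-even variant,
with the maximum absolute error dropping from just under 1 to 0.5, which is
the bias this patch removes.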