diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index e84d9361ada..b5aea1ab4ed 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1171,84 +1171,81 @@ lp_build_lerp_simple(struct lp_build_context *bld,
       return lp_build_mad(bld, x, delta, v0);
    }
 
-   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
-      if (!bld->type.sign) {
-         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
-            /*
-             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
-             * most-significant-bit to the lowest-significant-bit, so that
-             * later we can just divide by 2**n instead of 2**n - 1.
-             */
-
-            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
-         }
-
-         /* (x * delta) >> n */
-         /*
-          * For this multiply, higher internal precision is required to pass
-          * CTS, the most efficient path to that is pmulhrsw on ssse3 and
-          * above. This could be opencoded on other arches if conformance was
-          * required.
-          */
-         if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else {
-            res = lp_build_mul(bld, x, delta);
-            res = lp_build_shr_imm(bld, res, half_width);
-         }
-      } else {
+   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+      if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
          /*
          * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
          * most-significant-bit to the lowest-significant-bit, so that
          * later we can just divide by 2**n instead of 2**n - 1.
          */
-         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
+         x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
       }
-   } else {
-      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-      res = lp_build_mul(bld, x, delta);
-   }
 
-   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+      /*
+       * To have correct rounding, we must implement (example for 8 bits):
+       * uint16_t lerp_round_half_even(uint16_t x, uint16_t v0, uint16_t v1)
+       * {
+       *    uint16_t delta = v1 - v0;
+       *    uint16_t m = x * delta;
+       *    uint16_t is_odd = (m & 0x100) >> 8;
+       *    m += 0x7F + is_odd; // + 0.5 for odd, + ~0.498 for even
+       *    m >>= 8;
+       *    return (uint8_t)v0 + (uint8_t)m;
+       * }
+       */
+      res = lp_build_mul(bld, x, delta);
+      LLVMValueRef is_odd = lp_build_shr_imm(bld,lp_build_and(bld, res,
+         lp_build_const_int_vec(bld->gallivm, bld->type, 1ll << half_width)), half_width);
+      res = lp_build_add(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << (half_width - 1)) - 1));
+      res = lp_build_add(bld, res, is_odd);
+      res = lp_build_shr_imm(bld, res, half_width);
+
       /*
        * At this point both res and v0 only use the lower half of the bits,
        * the rest is zero. Instead of add / mask, do add with half wide type.
        */
       struct lp_type narrow_type;
-      struct lp_build_context narrow_bld;
-
       memset(&narrow_type, 0, sizeof narrow_type);
       narrow_type.sign   = bld->type.sign;
       narrow_type.width  = bld->type.width/2;
       narrow_type.length = bld->type.length*2;
-
+      struct lp_build_context narrow_bld;
       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
+
       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
       res = lp_build_add(&narrow_bld, v0, res);
       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
-   } else {
-      res = lp_build_add(bld, v0, res);
+      return res;
+   }
 
-      if (bld->type.fixed) {
-         /*
-          * We need to mask out the high order bits when lerping 8bit
-          * normalized colors stored on 16bits
-          *
-          * XXX: This step is necessary for lerping 8bit colors stored on
-          * 16bits, but it will be wrong for true fixed point use cases.
-          * Basically we need a more powerful lp_type, capable of further
-          * distinguishing the values interpretation from the value storage.
-          */
-         LLVMValueRef low_bits;
-         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
-         res = LLVMBuildAnd(builder, res, low_bits, "");
-      }
+   assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
+   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
+      /*
+       * The rescaling trick above doesn't work for signed numbers, so
+       * use the 2**n - 1 divison approximation in lp_build_mul_norm
+       * instead.
+       */
+      res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
+   } else {
+      res = lp_build_mul(bld, x, delta);
+   }
+
+   res = lp_build_add(bld, v0, res);
+
+   if (bld->type.fixed) {
+      /*
+       * We need to mask out the high order bits when lerping 8bit
+       * normalized colors stored on 16bits
+       *
+       * XXX: This step is necessary for lerping 8bit colors stored on
+       * 16bits, but it will be wrong for true fixed point use cases.
+       * Basically we need a more powerful lp_type, capable of further
+       * distinguishing the values interpretation from the value storage.
+       */
+      LLVMValueRef low_bits;
+      low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << half_width) - 1);
+      res = LLVMBuildAnd(builder, res, low_bits, "");
    }
 
    return res;
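
For reference, here is a minimal standalone C sketch of the rounding the new unsigned
LP_BLD_LERP_WIDE_NORMALIZED path builds with lp_build_mul / lp_build_shr_imm, specialised
to the common case of 8-bit channels widened to 16-bit lanes (half_width == 8). The
function name and the 0x7F + is_odd bias come from the comment the patch adds; the
x-rescaling step and the wrap-around narrow add are my reading of the per-lane behaviour,
and the test values in main() are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the new unsigned wide-normalized lerp path,
 * for 8-bit values stored in 16-bit lanes (half_width == 8). */
static uint8_t
lerp_round_half_even(uint16_t x, uint8_t v0, uint8_t v1)
{
   uint16_t delta = (uint16_t)(v1 - v0);  /* wraps mod 2^16 like the vector sub */

   /* !LP_BLD_LERP_PRESCALED_WEIGHTS: scale x from [0, 255] to [0, 256]
    * so the divide by 255 becomes a shift by 8. */
   x = x + (x >> 7);

   uint16_t m = (uint16_t)(x * delta);    /* lp_build_mul, wraps mod 2^16 */
   uint16_t is_odd = (m & 0x100) >> 8;    /* low bit of the integer part */
   m += 0x7F + is_odd;                    /* + 0.5 for odd, + ~0.498 for even */
   m >>= 8;                               /* lp_build_shr_imm by half_width */

   return (uint8_t)(v0 + m);              /* narrow add, wraps mod 2^8 */
}

int main(void)
{
   /* The (x * delta) / 256 term is exactly 0.5 here and rounds to 0 (even). */
   printf("%u\n", lerp_round_half_even(64, 10, 12));   /* prints 10 */
   /* Here the term is exactly 1.5 and rounds up to 2 (also even). */
   printf("%u\n", lerp_round_half_even(191, 10, 12));  /* prints 12 */
   return 0;
}

Note that the tie-break is applied to the weighted delta term before v0 is added back,
which is exactly what the is_odd extraction and bias add do in the generated IR.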