llvmpipe: use half-even rounding in lerp

This fixes a bias in texture linear sampling,
which can be very noticeable when strong lighting
is applied to mipmapped textures generated at
runtime by successive linear blits.

More bits aren't actually needed for the lerp, and
the intrinsic's rounding is wrong, so it is removed
in favour of correct, uniform codegen.

Reviewed-by: Jose Fonseca <jose.fonseca@broadcom.com>
Reviewed-by: Roland Scheidegger <roland.scheidegger@broadcom.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37986>
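
For reference, the two rounding behaviours can be sketched in standalone C. lerp_round_half_even() below is adapted from the 8-bit example in the new code comment (including the weight rescale that lp_build_lerp_simple() performs when LP_BLD_LERP_PRESCALED_WEIGHTS is not set); the truncating variant and the small driver are only an illustration of the bias described above, not the exact code that was removed.

#include <stdint.h>
#include <stdio.h>

/* Adapted from the new code comment: round-half-even lerp for 8-bit
 * values, with the weight rescaled from [0, 255] to [0, 256]. */
static uint8_t lerp_round_half_even(uint16_t x, uint16_t v0, uint16_t v1)
{
   x += x >> 7;                 /* scale [0, 2**n - 1] to [0, 2**n] */
   uint16_t delta = v1 - v0;
   uint16_t m = x * delta;
   uint16_t is_odd = (m & 0x100) >> 8;
   m += 0x7F + is_odd;          /* + 0.5 for odd, + ~0.498 for even */
   m >>= 8;
   return (uint8_t)v0 + (uint8_t)m;
}

/* Truncating counterpart, for illustration only (not the removed code):
 * the plain shift is a floor, so fractional blends always round down. */
static uint8_t lerp_truncate(uint16_t x, uint16_t v0, uint16_t v1)
{
   x += x >> 7;
   uint16_t delta = v1 - v0;
   uint16_t m = x * delta;
   m >>= 8;
   return (uint8_t)v0 + (uint8_t)m;
}

int main(void)
{
   /* Blend two adjacent texel values at a few weights and compare
    * against the exact result. */
   const uint16_t v0 = 100, v1 = 101;
   const uint16_t weights[] = { 0, 64, 127, 128, 192, 255 };
   for (size_t i = 0; i < sizeof(weights) / sizeof(weights[0]); i++) {
      uint16_t x = weights[i];
      double w = (x + (x >> 7)) / 256.0;
      printf("x=%3u  exact=%8.4f  truncate=%u  half-even=%u\n",
             (unsigned)x, v0 + w * (v1 - v0),
             (unsigned)lerp_truncate(x, v0, v1),
             (unsigned)lerp_round_half_even(x, v0, v1));
   }
   return 0;
}

At x = 128 and x = 192 the truncating lerp still returns 100 even though the exact blend is above 100.5; that always-downward error is the kind of bias the commit message refers to, and it compounds when mip levels are produced by blitting the previous level.
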
Authored by Aleksi Sapon on 2025-11-04 15:47:51 -05:00, committed by Marge Bot
parent 34b34c2462
commit c200b18e87


@@ -1171,84 +1171,81 @@ lp_build_lerp_simple(struct lp_build_context *bld,
       return lp_build_mad(bld, x, delta, v0);
    }

-   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
-      if (!bld->type.sign) {
-         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
-            /*
-             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
-             * most-significant-bit to the lowest-significant-bit, so that
-             * later we can just divide by 2**n instead of 2**n - 1.
-             */
-            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
-         }
-
-         /* (x * delta) >> n */
-         /*
-          * For this multiply, higher internal precision is required to pass
-          * CTS, the most efficient path to that is pmulhrsw on ssse3 and
-          * above. This could be opencoded on other arches if conformance was
-          * required.
-          */
-         if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else {
-            res = lp_build_mul(bld, x, delta);
-            res = lp_build_shr_imm(bld, res, half_width);
-         }
-      } else {
-         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
-      }
-   } else {
-      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-      res = lp_build_mul(bld, x, delta);
-   }
-
-   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+      if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
+         /*
+          * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
+          * most-significant-bit to the lowest-significant-bit, so that
+          * later we can just divide by 2**n instead of 2**n - 1.
+          */
+         x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
+      }
+
+      /*
+       * To have correct rounding, we must implement (example for 8 bits):
+       * uint16_t lerp_round_half_even(uint16_t x, uint16_t v0, uint16_t v1)
+       * {
+       *    uint16_t delta = v1 - v0;
+       *    uint16_t m = x * delta;
+       *    uint16_t is_odd = (m & 0x100) >> 8;
+       *    m += 0x7F + is_odd; // + 0.5 for odd, + ~0.498 for even
+       *    m >>= 8;
+       *    return (uint8_t)v0 + (uint8_t)m;
+       * }
+       */
+      res = lp_build_mul(bld, x, delta);
+      LLVMValueRef is_odd = lp_build_shr_imm(bld,lp_build_and(bld, res,
+         lp_build_const_int_vec(bld->gallivm, bld->type, 1ll << half_width)), half_width);
+      res = lp_build_add(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << (half_width - 1)) - 1));
+      res = lp_build_add(bld, res, is_odd);
+      res = lp_build_shr_imm(bld, res, half_width);
+
       /*
        * At this point both res and v0 only use the lower half of the bits,
        * the rest is zero. Instead of add / mask, do add with half wide type.
        */
       struct lp_type narrow_type;
-      struct lp_build_context narrow_bld;
       memset(&narrow_type, 0, sizeof narrow_type);
       narrow_type.sign = bld->type.sign;
       narrow_type.width = bld->type.width/2;
       narrow_type.length = bld->type.length*2;
+
+      struct lp_build_context narrow_bld;
       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
+
       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
       res = lp_build_add(&narrow_bld, v0, res);
       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
-   } else {
-      res = lp_build_add(bld, v0, res);
-
-      if (bld->type.fixed) {
-         /*
-          * We need to mask out the high order bits when lerping 8bit
-          * normalized colors stored on 16bits
-          *
-          * XXX: This step is necessary for lerping 8bit colors stored on
-          * 16bits, but it will be wrong for true fixed point use cases.
-          * Basically we need a more powerful lp_type, capable of further
-          * distinguishing the values interpretation from the value storage.
-          */
-         LLVMValueRef low_bits;
-         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
-         res = LLVMBuildAnd(builder, res, low_bits, "");
-      }
-   }
+
+      return res;
+   }
+
+   assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
+
+   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
+      /*
+       * The rescaling trick above doesn't work for signed numbers, so
+       * use the 2**n - 1 divison approximation in lp_build_mul_norm
+       * instead.
+       */
+      res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
+   } else {
+      res = lp_build_mul(bld, x, delta);
+   }
+
+   res = lp_build_add(bld, v0, res);
+
+   if (bld->type.fixed) {
+      /*
+       * We need to mask out the high order bits when lerping 8bit
+       * normalized colors stored on 16bits
+       *
+       * XXX: This step is necessary for lerping 8bit colors stored on
+       * 16bits, but it will be wrong for true fixed point use cases.
+       * Basically we need a more powerful lp_type, capable of further
+       * distinguishing the values interpretation from the value storage.
+       */
+      LLVMValueRef low_bits;
+      low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << half_width) - 1);
+      res = LLVMBuildAnd(builder, res, low_bits, "");
+   }

    return res;
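
A side note on the intrinsic path removed above: pmulhrsw takes the signed 16x16 product, shifts it right by 14, adds 1 and shifts right once more, so with the delta << 7 operand used in the old code it works out to (x * delta + 128) >> 8, i.e. round-to-nearest with ties rounded up rather than to even. A scalar model of one lane, purely for illustration (not part of this change):

#include <stdint.h>

/*
 * Scalar model of one pmulhrsw lane (SSSE3 "packed multiply high with
 * round and scale"): dst = (((a * b) >> 14) + 1) >> 1 on the full
 * signed 32-bit product.
 */
static int16_t pmulhrsw_lane(int16_t a, int16_t b)
{
   int32_t product = (int32_t)a * (int32_t)b;
   return (int16_t)(((product >> 14) + 1) >> 1);
}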