llvmpipe: use half-even rounding in lerp

This fixes a bias in linear texture sampling,
which can be very noticeable when strong lighting
is applied to mipmapped textures generated at
runtime using successive linear blits.

More bits aren't actually needed for the lerp, and
the pmulhrsw intrinsic's rounding is wrong, so the
intrinsic path is removed in favour of correct,
uniform codegen across architectures.

Reviewed-by: Jose Fonseca <jose.fonseca@broadcom.com>
Reviewed-by: Roland Scheidegger <roland.scheidegger@broadcom.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37986>
commit c200b18e87
parent 34b34c2462
Author: Aleksi Sapon
Date:   2025-11-04 15:47:51 -05:00
Committer: Marge Bot
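
The bias is easy to reproduce outside llvmpipe: the old non-SSE fallback
computed (x * delta) >> n, which truncates toward negative infinity, so
every linear blit in a runtime-generated mip chain loses up to half a
unit and the chain drifts darker. Below is a minimal standalone C sketch
of that drift, assuming 8-bit texels and a weight already rescaled to
[0, 256]; lerp_trunc, the random texture, and the mip loop are
illustrative, not llvmpipe code:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Truncating 8-bit lerp, like the old non-SSE fallback: (x * delta) >> 8. */
static uint8_t lerp_trunc(uint16_t x, uint8_t v0, uint8_t v1)
{
   uint16_t m = (uint16_t)(x * (uint16_t)(v1 - v0));
   return (uint8_t)(v0 + (m >> 8));
}

/* Round-half-even 8-bit lerp, as in the comment this commit adds. */
static uint8_t lerp_round_half_even(uint16_t x, uint8_t v0, uint8_t v1)
{
   uint16_t m = (uint16_t)(x * (uint16_t)(v1 - v0));
   uint16_t is_odd = (m & 0x100) >> 8;
   m = (uint16_t)(m + 0x7F + is_odd); /* + 0.5 for odd, + ~0.498 for even */
   return (uint8_t)(v0 + (m >> 8));
}

int main(void)
{
   uint8_t a[256], b[256];
   srand(1);
   for (int i = 0; i < 256; i++)
      a[i] = b[i] = (uint8_t)(rand() & 0xFF);

   /* Build a 1D mip chain: each level averages adjacent texels,
    * i.e. x = 128 once the weight is rescaled to [0, 256]. */
   for (int n = 128; n >= 1; n /= 2) {
      double mean_a = 0.0, mean_b = 0.0;
      for (int i = 0; i < n; i++) {
         a[i] = lerp_trunc(128, a[2 * i], a[2 * i + 1]);
         b[i] = lerp_round_half_even(128, b[2 * i], b[2 * i + 1]);
         mean_a += a[i];
         mean_b += b[i];
      }
      printf("%3d texels: trunc mean %7.3f, half-even mean %7.3f\n",
             n, mean_a / n, mean_b / n);
   }
   return 0;
}

On random data the truncating mean falls by roughly 0.25 per level on the
well-populated levels, while the half-even mean stays flat.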


@@ -1171,68 +1171,66 @@ lp_build_lerp_simple(struct lp_build_context *bld,
       return lp_build_mad(bld, x, delta, v0);
    }
 
-   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
-      if (!bld->type.sign) {
+   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
       if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
          /*
           * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
           * most-significant-bit to the lowest-significant-bit, so that
           * later we can just divide by 2**n instead of 2**n - 1.
           */
          x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
       }
 
-         /* (x * delta) >> n */
       /*
-       * For this multiply, higher internal precision is required to pass
-       * CTS, the most efficient path to that is pmulhrsw on ssse3 and
-       * above. This could be opencoded on other arches if conformance was
-       * required.
+       * To have correct rounding, we must implement (example for 8 bits):
+       * uint16_t lerp_round_half_even(uint16_t x, uint16_t v0, uint16_t v1)
+       * {
+       *    uint16_t delta = v1 - v0;
+       *    uint16_t m = x * delta;
+       *    uint16_t is_odd = (m & 0x100) >> 8;
+       *    m += 0x7F + is_odd; // + 0.5 for odd, + ~0.498 for even
+       *    m >>= 8;
+       *    return (uint8_t)v0 + (uint8_t)m;
+       * }
        */
-         if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
-            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
-         } else {
       res = lp_build_mul(bld, x, delta);
+      LLVMValueRef is_odd = lp_build_shr_imm(bld, lp_build_and(bld, res,
+         lp_build_const_int_vec(bld->gallivm, bld->type, 1ll << half_width)), half_width);
+      res = lp_build_add(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << (half_width - 1)) - 1));
+      res = lp_build_add(bld, res, is_odd);
       res = lp_build_shr_imm(bld, res, half_width);
-         }
-      } else {
-         /*
-          * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
-          * most-significant-bit to the lowest-significant-bit, so that
-          * later we can just divide by 2**n instead of 2**n - 1.
-          */
-         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
-      }
-   } else {
-      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
-      res = lp_build_mul(bld, x, delta);
-   }
 
-   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
       /*
        * At this point both res and v0 only use the lower half of the bits,
        * the rest is zero. Instead of add / mask, do add with half wide type.
        */
       struct lp_type narrow_type;
-      struct lp_build_context narrow_bld;
       memset(&narrow_type, 0, sizeof narrow_type);
       narrow_type.sign = bld->type.sign;
       narrow_type.width = bld->type.width/2;
       narrow_type.length = bld->type.length*2;
+      struct lp_build_context narrow_bld;
       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
       res = lp_build_add(&narrow_bld, v0, res);
       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
+      return res;
+   }
+
+   assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
+   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
+      /*
+       * The rescaling trick above doesn't work for signed numbers, so
+       * use the 2**n - 1 division approximation in lp_build_mul_norm
+       * instead.
+       */
+      res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
    } else {
+      res = lp_build_mul(bld, x, delta);
+   }
+
    res = lp_build_add(bld, v0, res);
 
    if (bld->type.fixed) {
@@ -1246,10 +1244,9 @@ lp_build_lerp_simple(struct lp_build_context *bld,
        * distinguishing the values interpretation from the value storage.
        */
       LLVMValueRef low_bits;
-      low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
+      low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1ll << half_width) - 1);
       res = LLVMBuildAnd(builder, res, low_bits, "");
    }
-   }
 
    return res;
 }
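
The 16-bit trick in the new comment is worth a sanity check: adding 0x7F
plus bit n of the product before the shift is exactly round-half-to-even
of x * delta / 2**n, even when the product wraps the 16-bit lane. Below
is a brute-force comparison against a straightforward 32-bit reference;
lerp_ref and the test harness are illustrative, not part of the commit:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* The routine from the comment in the diff above. */
static uint8_t lerp_round_half_even(uint16_t x, uint16_t v0, uint16_t v1)
{
   uint16_t delta = (uint16_t)(v1 - v0);
   uint16_t m = (uint16_t)(x * delta);
   uint16_t is_odd = (m & 0x100) >> 8;
   m = (uint16_t)(m + 0x7F + is_odd);
   m >>= 8;
   return (uint8_t)((uint8_t)v0 + (uint8_t)m);
}

/* Reference: round x * delta / 256 half-to-even in 32-bit signed math.
 * Assumes arithmetic right shift of negative values, true in practice. */
static uint8_t lerp_ref(uint16_t x, uint8_t v0, uint8_t v1)
{
   int32_t p = (int32_t)x * ((int32_t)v1 - (int32_t)v0);
   int32_t q = p >> 8;   /* floor(p / 256) */
   int32_t r = p & 0xFF; /* non-negative remainder */
   if (r > 0x80 || (r == 0x80 && (q & 1)))
      q++;               /* round up, breaking exact ties toward even */
   return (uint8_t)(v0 + q);
}

int main(void)
{
   for (uint32_t x = 0; x <= 256; x++)
      for (uint32_t v0 = 0; v0 < 256; v0++)
         for (uint32_t v1 = 0; v1 < 256; v1++)
            assert(lerp_round_half_even((uint16_t)x, (uint16_t)v0, (uint16_t)v1) ==
                   lerp_ref((uint16_t)x, (uint8_t)v0, (uint8_t)v1));
   puts("16-bit half-even lerp matches the 32-bit reference for all inputs");
   return 0;
}

Note that the rounding applies to the product term, not to v0 + m: on an
exact tie the two can differ by one when v0 is odd, and the product is
what the generated code rounds.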