llvmpipe: don't rely on cpu denorms for float to smallfloat conversion

Similar to what we already do for smallfloat to float conversion, handle denorms and normals separately with bit manipulation rather than relying on a rescale mul, which depends on cpu denorm support. This is a bit more complex, but on the upside we no longer need to track fpstate for denorms in the llvmpipe backend. (With modern x86 cpus this is essentially only relevant for the r11g11b10 float format, since f16 formats use the f16c instructions.) Reviewed-by: Brian Paul <brian.paul@broadcom.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40787>
2026-05-07 02:48:06 +02:00 · 2026-04-08 18:40:48 +02:00 · 2026-04-08 18:40:48 +02:00 · c20106f1e4
commit c20106f1e4
parent 2ba62d1502
2 changed files with 47 additions and 36 deletions
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
@ -113,23 +113,56 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm,
   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
                                          ~((1 << (23 - mantissa_bits)) - 1) &
                                          0x7fffffff);
-   rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, "");
   rescale_src = lp_build_and(&i32_bld, rescale_src, i32_roundmask);
-   rescale_src = LLVMBuildBitCast(builder, rescale_src, f32_bld.vec_type, "");

-   /* bias exponent (and denormalize if necessary) */
-   magic = lp_build_const_int_vec(gallivm, i32_type,
-                                  ((1 << (exponent_bits - 1)) - 1) << 23);
-   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
-   normal = lp_build_mul(&f32_bld, rescale_src, magic);
-
-   /* clamp to max value - largest non-infinity number */
+   /* largest non-infinity number */
   small_max = lp_build_const_int_vec(gallivm, i32_type,
-                                      (((1 << exponent_bits) - 2) << 23) |
+                                      ((127 + ((1 << (exponent_bits - 1)) - 1)) << 23) |
                                      (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
-   small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
-   normal = lp_build_min(&f32_bld, normal, small_max);
-   normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
+
+   /*
+    * The (disabled) multiply-based path below only gives correct results for
+    * smallfloat denormals if cpu denormals are enabled; otherwise such
+    * results are flushed to zero.
+    */
+   if (0) {
+      /* clamp to max value */
+      rescale_src = LLVMBuildBitCast(builder, rescale_src, f32_bld.vec_type, "");
+      small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
+      normal = lp_build_min(&f32_bld, rescale_src, small_max);
+      /* bias exponent (and denormalize if necessary) */
+      magic = lp_build_const_int_vec(gallivm, i32_type,
+                                     ((1 << (exponent_bits - 1)) - 1) << 23);
+      magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
+      normal = lp_build_mul(&f32_bld, normal, magic);
+      normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
+   }
+   else {
+      LLVMValueRef exp_adj, denorm_scale, is_denorm_or_zero, denorm;
+      /* clamp to max value */
+      normal = lp_build_min(&i32_bld, rescale_src, small_max);
+      exp_adj = lp_build_const_int_vec(gallivm, i32_type,
+                                       (127 - ((1 << (exponent_bits - 1)) - 1)) << 23);
+      normal = lp_build_sub(&i32_bld, normal, exp_adj);
+      LLVMValueRef mantissa_mask = lp_build_not(&i32_bld,
+                                                lp_build_const_int_vec(gallivm, i32_type, 0xFF800000));
+      is_denorm_or_zero = lp_build_cmp(&i32_bld, PIPE_FUNC_LEQUAL, normal, mantissa_mask);
+      /*
+       * for smallfloat denormals, do magic scaling so the mantissa bits can
+       * be extracted directly. denorm_scale is just the smallest normal number.
+       */
+      denorm_scale = lp_build_const_int_vec(gallivm, i32_type,
+                                            (127 - ((1 << (exponent_bits - 1)) - 2)) << 23);
+      denorm_scale = LLVMBuildBitCast(builder, denorm_scale, f32_bld.vec_type, "");
+      rescale_src = LLVMBuildBitCast(builder, rescale_src, f32_bld.vec_type, "");
+      /*
+       * this magic add will shift out the mantissa bits to the right position (if
+       * the number is actually smaller than denorm_scale).
+       */
+      denorm = lp_build_add(&f32_bld, rescale_src, denorm_scale);
+      denorm = LLVMBuildBitCast(builder, denorm, i32_bld.vec_type, "");
+      denorm = lp_build_and(&i32_bld, denorm, mantissa_mask);
+      normal = lp_build_select(&i32_bld, is_denorm_or_zero, denorm, normal);
+   }

   /*
    * handle nan/inf cases
@ -344,7 +377,7 @@ lp_build_smallfloat_to_float(struct gallivm_state *gallivm,

      /* for normals, Infs, Nans fix up exponent */
      exp_adj = lp_build_const_int_vec(gallivm, i32_type,
-                                      (127 - ((1 << (exponent_bits - 1)) - 1)) << 23);
+                                       (127 - ((1 << (exponent_bits - 1)) - 1)) << 23);
      normal = lp_build_add(&i32_bld, srcabs, exp_adj);
      tmp = lp_build_and(&i32_bld, wasinfnan, i32_floatexpmask);
      normal = lp_build_or(&i32_bld, tmp, normal);
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@ -2481,7 +2481,6 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,

   const bool is_1d = variant->key.resource_1d;
   const unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
-   LLVMValueRef fpstate = NULL;

   LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);

@ -2490,23 +2489,6 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
   lp_blend_type_from_format_desc(out_format_desc, &row_type);
   lp_mem_type_from_format_desc(out_format_desc, &dst_type);

-   /*
-    * Technically this code should go into lp_build_smallfloat_to_float
-    * and lp_build_float_to_smallfloat but due to the
-    * http://llvm.org/bugs/show_bug.cgi?id=6393
-    * llvm reorders the mxcsr intrinsics in a way that breaks the code.
-    * So the ordering is important here and there shouldn't be any
-    * llvm ir instrunctions in this function before
-    * this, otherwise half-float format conversions won't work
-    * (again due to llvm bug #6393).
-    */
-   if (have_smallfloat_format(dst_type, out_format)) {
-      /* We need to make sure that denorms are ok for half float
-         conversions */
-      fpstate = lp_build_fpstate_get(gallivm);
-      lp_build_fpstate_set_denorms_zero(gallivm, false);
-   }
-
   struct lp_type mask_type = lp_int32_vec4_type();
   mask_type.length = fs_type.length;

@ -3129,10 +3111,6 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
   if (do_branch) {
      lp_build_mask_end(&mask_ctx);
   }
-
-   if (fpstate) {
-      lp_build_fpstate_set(gallivm, fpstate);
-   }
 }