llvmpipe: fix blending with half-float formats

The fact that we flush denorms to zero breaks our half-float conversion and blending. This patch enables denorms for blending. It's a little tricky due to the llvm bug that makes it incorrectly reorder the mxcsr intrinsics: http://llvm.org/bugs/show_bug.cgi?id=6393 Signed-off-by: Zack Rusin <zackr@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> Signed-off-by: Zack Rusin <zackr@vmware.com>
2026-02-15 13:10:31 +01:00 · 2013-12-06 01:28:25 -05:00 · 2013-12-06 01:28:25 -05:00 · 155139059b
commit 155139059b
parent 1e71493afa
3 changed files with 108 additions and 5 deletions
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@ -64,6 +64,17 @@
 #include "lp_bld_arit.h"
 #include "lp_bld_flow.h"

+#if defined(PIPE_ARCH_SSE)
+#include <xmmintrin.h>
+#endif
+
+#ifndef _MM_DENORMALS_ZERO_MASK
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#endif
+
+#ifndef _MM_FLUSH_ZERO_MASK
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#endif

 #define EXP_POLY_DEGREE 5

@ -3489,3 +3500,63 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
   return ret;
 }

+/**
+ * Emit IR that saves the current MXCSR value into a newly allocated slot.
+ *
+ * An i32 named "mxcsr_ptr" is allocated with lp_build_alloca() and its
+ * address is handed to the llvm.x86.sse.stmxcsr intrinsic.
+ *
+ * \param gallivm  state supplying the LLVM builder and context
+ * \return the pointer to the i32 slot when util_cpu_caps.has_sse is set;
+ *         0 otherwise, in which case no IR is emitted
+ */
+LLVMValueRef
+lp_build_fpstate_get(struct gallivm_state *gallivm)
+{
+   if (util_cpu_caps.has_sse) {
+      LLVMBuilderRef builder = gallivm->builder;
+      LLVMValueRef mxcsr_ptr = lp_build_alloca(
+         gallivm,
+         LLVMInt32TypeInContext(gallivm->context),
+         "mxcsr_ptr");
+      lp_build_intrinsic(builder,
+                         "llvm.x86.sse.stmxcsr",
+                         LLVMVoidTypeInContext(gallivm->context),
+                         &mxcsr_ptr, 1);
+      return mxcsr_ptr;
+   }
+   return 0;
+}
+/**
+ * Emit IR that sets or clears the denormal-flushing bits of MXCSR.
+ *
+ * The affected bits are _MM_FLUSH_ZERO_MASK and, when
+ * util_cpu_caps.has_daz is set, _MM_DENORMALS_ZERO_MASK.  The current
+ * MXCSR value is fetched with lp_build_fpstate_get(), modified, stored
+ * back into the same slot and reloaded with lp_build_fpstate_set().
+ *
+ * Does nothing when util_cpu_caps.has_sse is not set.
+ *
+ * \param gallivm  state supplying the LLVM builder and context
+ * \param zero     TRUE to set the bits (OR with the mask), FALSE to clear
+ *                 them (AND with the complement of the mask)
+ */
+void
+lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
+                                  boolean zero)
+{
+   if (util_cpu_caps.has_sse) {
+      /* Bits to toggle: FTZ (32768) always, DAZ (64) added below if
+       * available; both together are 32832. */
+      int daz_ftz = _MM_FLUSH_ZERO_MASK;
+
+      LLVMBuilderRef builder = gallivm->builder;
+      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
+      LLVMValueRef mxcsr =
+         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
+
+      if (util_cpu_caps.has_daz) {
+         /* Enable denormals are zero mode */
+         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
+      }
+      if (zero) {
+         mxcsr = LLVMBuildOr(builder, mxcsr,
+                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
+      } else {
+         mxcsr = LLVMBuildAnd(builder, mxcsr,
+                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
+      }
+      /* Write the updated value back to the slot and load it into MXCSR. */
+      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
+      lp_build_fpstate_set(gallivm, mxcsr_ptr);
+   }
+}
+/**
+ * Emit IR that loads MXCSR from memory via llvm.x86.sse.ldmxcsr.
+ *
+ * Does nothing when util_cpu_caps.has_sse is not set.
+ *
+ * \param gallivm    state supplying the LLVM builder and context
+ * \param mxcsr_ptr  pointer passed to the intrinsic; the callers in this
+ *                   patch pass the value from lp_build_fpstate_get()
+ */
+void
+lp_build_fpstate_set(struct gallivm_state *gallivm,
+                     LLVMValueRef mxcsr_ptr)
+{
+   if (util_cpu_caps.has_sse) {
+      lp_build_intrinsic(gallivm->builder,
+                         "llvm.x86.sse.ldmxcsr",
+                         LLVMVoidTypeInContext(gallivm->context),
+                         &mxcsr_ptr, 1);
+   }
+}
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@ -358,4 +358,15 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x);

+
+LLVMValueRef
+lp_build_fpstate_get(struct gallivm_state *gallivm);
+
+void
+lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
+                                  boolean zero);
+void
+lp_build_fpstate_set(struct gallivm_state *gallivm,
+                     LLVMValueRef mxcsr);
+
 #endif /* !LP_BLD_ARIT_H */
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@ -1554,6 +1554,28 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,

   const boolean is_1d = variant->key.resource_1d;
   unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
+   LLVMValueRef fpstate = 0;
+
+   /* Get type from output format */
+   lp_blend_type_from_format_desc(out_format_desc, &row_type);
+   lp_mem_type_from_format_desc(out_format_desc, &dst_type);
+
+   /*
+    * Technically this code should go into lp_build_smallfloat_to_float
+    * and lp_build_float_to_smallfloat, but due to
+    * http://llvm.org/bugs/show_bug.cgi?id=6393
+    * llvm reorders the mxcsr intrinsics in a way that breaks the code.
+    * So the ordering is important here and there shouldn't be any
+    * llvm ir instructions in this function before
+    * this, otherwise half-float format conversions won't work
+    * (again due to llvm bug #6393).
+    */
+   if (dst_type.floating && dst_type.width != 32) {
+      /* We need to make sure that denorms are ok for half float
+         conversions */
+      fpstate = lp_build_fpstate_get(gallivm);
+      lp_build_fpstate_set_denorms_zero(gallivm, FALSE);
+   }

   mask_type = lp_int32_vec4_type();
   mask_type.length = fs_type.length;
@ -1587,11 +1609,6 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
   undef_src_val = lp_build_undef(gallivm, fs_type);
 #endif

-
-   /* Get type from output format */
-   lp_blend_type_from_format_desc(out_format_desc, &row_type);
-   lp_mem_type_from_format_desc(out_format_desc, &dst_type);
-
   row_type.length = fs_type.length;
   vector_width    = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;

@ -2051,6 +2068,10 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                             dst, dst_type, dst_count, dst_alignment);
   }

+   if (dst_type.floating && dst_type.width != 32) {
+      lp_build_fpstate_set(gallivm, fpstate);
+   }
+
   if (do_branch) {
      lp_build_mask_end(&mask_ctx);
   }