util: implement F16C using inline assembly on x86_64

F16C: https://en.wikipedia.org/wiki/F16C This also happens to fix bptc-float-modes on llvmpipe. Reviewed-by: Matt Turner <mattst88@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6987>
2026-02-22 03:20:43 +01:00 · 2020-09-18 05:21:09 -04:00 · 2020-09-18 05:21:09 -04:00 · ffcdf76799
commit ffcdf76799
parent 4c54f05915
6 changed files with 84 additions and 19 deletions
--- a/.gitlab-ci/piglit/quick_gl.txt
+++ b/.gitlab-ci/piglit/quick_gl.txt
@ -738,7 +738,6 @@ spec/arb_sparse_buffer/commit: skip
 spec/arb_sparse_buffer/minmax: skip
 spec/arb_tessellation_shader/arb_tessellation_shader-immediate-mode-draw-patches: skip
 spec/arb_texture_buffer_object/negative-unsupported: skip
-spec/arb_texture_compression_bptc/bptc-float-modes: fail
 spec/arb_texture_cube_map/copyteximage cube samples=16: skip
 spec/arb_texture_cube_map/copyteximage cube samples=2: skip
 spec/arb_texture_cube_map/copyteximage cube samples=32: skip
@ -1656,8 +1655,8 @@ wgl/wgl-sanity: skip
 summary:
       name:  results
       ----  --------
-       pass:    23074
-       fail:      198
+       pass:    23075
+       fail:      197
      crash:        0
       skip:     1433
    timeout:        0
--- a/src/gallium/tests/unit/u_half_test.c
+++ b/src/gallium/tests/unit/u_half_test.c
@ -4,9 +4,10 @@

 #include "util/u_math.h"
 #include "util/u_half.h"
+#include "util/u_cpu_detect.h"

-int
-main(int argc, char **argv)
+static void
+test(void)
 {
   unsigned i;
   unsigned roundtrip_fails = 0;
@ -28,9 +29,21 @@ main(int argc, char **argv)

   if(roundtrip_fails) {
      printf("Failure! %u/65536 half floats failed a conversion to float and back.\n", roundtrip_fails);
-      return 1;
-   } else {
-      printf("Success!\n");
-      return 0;
+      exit(1);
   }
 }
+
+int
+main(int argc, char **argv)
+{
+   assert(!util_cpu_caps.has_f16c);
+   test();
+
+   /* Test f16c. */
+   util_cpu_detect();
+   if (util_cpu_caps.has_f16c)
+      test();
+
+   printf("Success!\n");
+   return 0;
+}
--- a/src/util/half_float.c
+++ b/src/util/half_float.c
@ -54,7 +54,7 @@ typedef union { float f; int32_t i; uint32_t u; } fi_type;
 *     result in the same value as if the expression were executed on the GPU.
 */
 uint16_t
-_mesa_float_to_half(float val)
+_mesa_float_to_half_slow(float val)
 {
   const fi_type fi = {val};
   const int flt_m = fi.i & 0x7fffff;
@ -129,9 +129,9 @@ _mesa_float_to_half(float val)
 }

 uint16_t
-_mesa_float_to_float16_rtz(float val)
+_mesa_float_to_float16_rtz_slow(float val)
 {
-    return _mesa_float_to_half_rtz(val);
+    return _mesa_float_to_half_rtz_slow(val);
 }

 /**
@ -140,7 +140,7 @@ _mesa_float_to_float16_rtz(float val)
 * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html
 */
 float
-_mesa_half_to_float(uint16_t val)
+_mesa_half_to_float_slow(uint16_t val)
 {
   return util_half_to_float(val);
 }
--- a/src/util/half_float.h
+++ b/src/util/half_float.h
@ -28,6 +28,12 @@

 #include <stdbool.h>
 #include <stdint.h>
+#include <string.h>
+#include "util/u_cpu_detect.h"
+
+#if defined(USE_X86_64_ASM)
+#include <immintrin.h>
+#endif

 #ifdef __cplusplus
 extern "C" {
@ -36,18 +42,65 @@ extern "C" {
 #define FP16_ONE     ((uint16_t) 0x3c00)
 #define FP16_ZERO    ((uint16_t) 0)

-uint16_t _mesa_float_to_half(float val);
-float _mesa_half_to_float(uint16_t val);
+uint16_t _mesa_float_to_half_slow(float val);
+float _mesa_half_to_float_slow(uint16_t val);
 uint8_t _mesa_half_to_unorm8(uint16_t v);
 uint16_t _mesa_uint16_div_64k_to_half(uint16_t v);

 /*
- * _mesa_float_to_float16_rtz is no more than a wrapper to the counterpart
+ * _mesa_float_to_float16_rtz_slow is no more than a wrapper to the counterpart
 * softfloat.h call. Still, softfloat.h conversion API is meant to be kept
 * private. In other words, only use the API published here, instead of
 * calling directly the softfloat.h one.
 */
-uint16_t _mesa_float_to_float16_rtz(float val);
+uint16_t _mesa_float_to_float16_rtz_slow(float val);
+
+static inline uint16_t
+_mesa_float_to_half(float val)
+{
+#if defined(USE_X86_64_ASM)
+   if (util_cpu_caps.has_f16c) {
+      __m128 in = {val};
+      __m128i out;
+
+      /* $0 = round to nearest */
+      __asm volatile("vcvtps2ph $0, %1, %0" : "=v"(out) : "v"(in));
+      return out[0];
+   }
+#endif
+   return _mesa_float_to_half_slow(val);
+}
+
+static inline float
+_mesa_half_to_float(uint16_t val)
+{
+#if defined(USE_X86_64_ASM)
+   if (util_cpu_caps.has_f16c) {
+      __m128i in = {val};
+      __m128 out;
+
+      __asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in));
+      return out[0];
+   }
+#endif
+   return _mesa_half_to_float_slow(val);
+}
+
+static inline uint16_t
+_mesa_float_to_float16_rtz(float val)
+{
+#if defined(USE_X86_64_ASM)
+   if (util_cpu_caps.has_f16c) {
+      __m128 in = {val};
+      __m128i out;
+
+      /* $3 = round towards zero (truncate) */
+      __asm volatile("vcvtps2ph $3, %1, %0" : "=v"(out) : "v"(in));
+      return out[0];
+   }
+#endif
+   return _mesa_float_to_float16_rtz_slow(val);
+}

 static inline uint16_t
 _mesa_float_to_float16_rtne(float val)
--- a/src/util/softfloat.c
+++ b/src/util/softfloat.c
@ -1435,7 +1435,7 @@ _mesa_double_to_f32(double val, bool rtz)
 * From f32_to_f16()
 */
 uint16_t
-_mesa_float_to_half_rtz(float val)
+_mesa_float_to_half_rtz_slow(float val)
 {
    const fi_type fi = {val};
    const uint32_t flt_m = fi.u & 0x7fffff;
--- a/src/util/softfloat.h
+++ b/src/util/softfloat.h
@ -56,7 +56,7 @@ double _mesa_double_mul_rtz(double a, double b);
 double _mesa_double_fma_rtz(double a, double b, double c);
 float _mesa_float_fma_rtz(float a, float b, float c);
 float _mesa_double_to_f32(double x, bool rtz);
-uint16_t _mesa_float_to_half_rtz(float x);
+uint16_t _mesa_float_to_half_rtz_slow(float x);

 #ifdef __cplusplus
 } /* extern C */