From ffcdf76799b0b23726d45f97502e2b9826ec628e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 18 Sep 2020 05:21:09 -0400 Subject: [PATCH] util: implement F16C using inline assembly on x86_64 F16C: https://en.wikipedia.org/wiki/F16C This also happens to fix bptc-float-modes on llvmpipe. Reviewed-by: Matt Turner Part-of: --- .gitlab-ci/piglit/quick_gl.txt | 5 +-- src/gallium/tests/unit/u_half_test.c | 25 +++++++++--- src/util/half_float.c | 8 ++-- src/util/half_float.h | 61 ++++++++++++++++++++++++++-- src/util/softfloat.c | 2 +- src/util/softfloat.h | 2 +- 6 files changed, 84 insertions(+), 19 deletions(-) diff --git a/.gitlab-ci/piglit/quick_gl.txt b/.gitlab-ci/piglit/quick_gl.txt index a24ae0d3647..104d2f6cd01 100644 --- a/.gitlab-ci/piglit/quick_gl.txt +++ b/.gitlab-ci/piglit/quick_gl.txt @@ -738,7 +738,6 @@ spec/arb_sparse_buffer/commit: skip spec/arb_sparse_buffer/minmax: skip spec/arb_tessellation_shader/arb_tessellation_shader-immediate-mode-draw-patches: skip spec/arb_texture_buffer_object/negative-unsupported: skip -spec/arb_texture_compression_bptc/bptc-float-modes: fail spec/arb_texture_cube_map/copyteximage cube samples=16: skip spec/arb_texture_cube_map/copyteximage cube samples=2: skip spec/arb_texture_cube_map/copyteximage cube samples=32: skip @@ -1656,8 +1655,8 @@ wgl/wgl-sanity: skip summary: name: results ---- -------- - pass: 23074 - fail: 198 + pass: 23075 + fail: 197 crash: 0 skip: 1433 timeout: 0 diff --git a/src/gallium/tests/unit/u_half_test.c b/src/gallium/tests/unit/u_half_test.c index 48a9a2d539c..fb4ce6ec9f2 100644 --- a/src/gallium/tests/unit/u_half_test.c +++ b/src/gallium/tests/unit/u_half_test.c @@ -4,9 +4,10 @@ #include "util/u_math.h" #include "util/u_half.h" +#include "util/u_cpu_detect.h" -int -main(int argc, char **argv) +static void +test(void) { unsigned i; unsigned roundtrip_fails = 0; @@ -28,9 +29,21 @@ main(int argc, char **argv) if(roundtrip_fails) { printf("Failure! %u/65536 half floats failed a conversion to float and back.\n", roundtrip_fails); - return 1; - } else { - printf("Success!\n"); - return 0; + exit(1); } } + +int +main(int argc, char **argv) +{ + assert(!util_cpu_caps.has_f16c); + test(); + + /* Test f16c. */ + util_cpu_detect(); + if (util_cpu_caps.has_f16c) + test(); + + printf("Success!\n"); + return 0; +} diff --git a/src/util/half_float.c b/src/util/half_float.c index aae690a56a6..61b512f48ed 100644 --- a/src/util/half_float.c +++ b/src/util/half_float.c @@ -54,7 +54,7 @@ typedef union { float f; int32_t i; uint32_t u; } fi_type; * result in the same value as if the expression were executed on the GPU. */ uint16_t -_mesa_float_to_half(float val) +_mesa_float_to_half_slow(float val) { const fi_type fi = {val}; const int flt_m = fi.i & 0x7fffff; @@ -129,9 +129,9 @@ _mesa_float_to_half(float val) } uint16_t -_mesa_float_to_float16_rtz(float val) +_mesa_float_to_float16_rtz_slow(float val) { - return _mesa_float_to_half_rtz(val); + return _mesa_float_to_half_rtz_slow(val); } /** @@ -140,7 +140,7 @@ _mesa_float_to_float16_rtz(float val) * http://www.opengl.org/discussion_boards/ubb/Forum3/HTML/008786.html */ float -_mesa_half_to_float(uint16_t val) +_mesa_half_to_float_slow(uint16_t val) { return util_half_to_float(val); } diff --git a/src/util/half_float.h b/src/util/half_float.h index c9fad9a9400..c52bccf8d1e 100644 --- a/src/util/half_float.h +++ b/src/util/half_float.h @@ -28,6 +28,12 @@ #include #include +#include +#include "util/u_cpu_detect.h" + +#if defined(USE_X86_64_ASM) +#include +#endif #ifdef __cplusplus extern "C" { @@ -36,18 +42,65 @@ extern "C" { #define FP16_ONE ((uint16_t) 0x3c00) #define FP16_ZERO ((uint16_t) 0) -uint16_t _mesa_float_to_half(float val); -float _mesa_half_to_float(uint16_t val); +uint16_t _mesa_float_to_half_slow(float val); +float _mesa_half_to_float_slow(uint16_t val); uint8_t _mesa_half_to_unorm8(uint16_t v); uint16_t _mesa_uint16_div_64k_to_half(uint16_t v); /* - * _mesa_float_to_float16_rtz is no more than a wrapper to the counterpart + * _mesa_float_to_float16_rtz_slow is no more than a wrapper to the counterpart * softfloat.h call. Still, softfloat.h conversion API is meant to be kept * private. In other words, only use the API published here, instead of * calling directly the softfloat.h one. */ -uint16_t _mesa_float_to_float16_rtz(float val); +uint16_t _mesa_float_to_float16_rtz_slow(float val); + +static inline uint16_t +_mesa_float_to_half(float val) +{ +#if defined(USE_X86_64_ASM) + if (util_cpu_caps.has_f16c) { + __m128 in = {val}; + __m128i out; + + /* $0 = round to nearest */ + __asm volatile("vcvtps2ph $0, %1, %0" : "=v"(out) : "v"(in)); + return out[0]; + } +#endif + return _mesa_float_to_half_slow(val); +} + +static inline float +_mesa_half_to_float(uint16_t val) +{ +#if defined(USE_X86_64_ASM) + if (util_cpu_caps.has_f16c) { + __m128i in = {val}; + __m128 out; + + __asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in)); + return out[0]; + } +#endif + return _mesa_half_to_float_slow(val); +} + +static inline uint16_t +_mesa_float_to_float16_rtz(float val) +{ +#if defined(USE_X86_64_ASM) + if (util_cpu_caps.has_f16c) { + __m128 in = {val}; + __m128i out; + + /* $3 = round towards zero (truncate) */ + __asm volatile("vcvtps2ph $3, %1, %0" : "=v"(out) : "v"(in)); + return out[0]; + } +#endif + return _mesa_float_to_float16_rtz_slow(val); +} static inline uint16_t _mesa_float_to_float16_rtne(float val) diff --git a/src/util/softfloat.c b/src/util/softfloat.c index 365b15bbf0c..50cf098fd9f 100644 --- a/src/util/softfloat.c +++ b/src/util/softfloat.c @@ -1435,7 +1435,7 @@ _mesa_double_to_f32(double val, bool rtz) * From f32_to_f16() */ uint16_t -_mesa_float_to_half_rtz(float val) +_mesa_float_to_half_rtz_slow(float val) { const fi_type fi = {val}; const uint32_t flt_m = fi.u & 0x7fffff; diff --git a/src/util/softfloat.h b/src/util/softfloat.h index 4e48c6548b9..2e254e29892 100644 --- a/src/util/softfloat.h +++ b/src/util/softfloat.h @@ -56,7 +56,7 @@ double _mesa_double_mul_rtz(double a, double b); double _mesa_double_fma_rtz(double a, double b, double c); float _mesa_float_fma_rtz(float a, float b, float c); float _mesa_double_to_f32(double x, bool rtz); -uint16_t _mesa_float_to_half_rtz(float x); +uint16_t _mesa_float_to_half_rtz_slow(float x); #ifdef __cplusplus } /* extern C */