nir, glsl: Add support for softfloat32

Based on existing softfloat64 support and Berkeley SoftFloat. This is targeted at drivers that can't preserve denorms, so operations where denorm support is irrelevant like conversions to/from integers aren't handled. Because the existing mechanism used by Gallium for softfloat64 doesn't support includes, we unfortunately can't extract common code into a header. This can be done later if we switch Gallium to using glslang and spirv-to-nir. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37608>
2025-12-20 05:10:11 +01:00 · 2025-09-19 11:03:19 -04:00 · 2025-09-19 11:03:19 -04:00 · d30ff374a1
commit d30ff374a1
parent 9e477555c5
7 changed files with 859 additions and 31 deletions
--- a/src/compiler/glsl/float32.glsl
+++ b/src/compiler/glsl/float32.glsl
@ -0,0 +1,629 @@
+/*
+ * The implementations contained in this file are heavily based on the
+ * implementations found in the Berkeley SoftFloat library. As such, they are
+ * licensed under the same 3-clause BSD license:
+ *
+ * License for Berkeley SoftFloat Release 3e
+ *
+ * John R. Hauser
+ * 2018 January 20
+ *
+ * The following applies to the whole of SoftFloat Release 3e as well as to
+ * each source file individually.
+ *
+ * Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018 The Regents of the
+ * University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions, and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions, and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the University nor the names of its contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#version 450
+#extension GL_ARB_shader_bit_encoding : enable
+#extension GL_EXT_shader_integer_mix : enable
+
+/* Enable this just to suppress warnings about __ */
+#extension GL_EXT_spirv_intrinsics : enable
+
+#pragma warning(off)
+
+/* Software IEEE floating-point rounding mode.
+ * GLSL spec section "4.7.1 Range and Precision":
+ *    The rounding mode cannot be set and is undefined.
+ * Here, however, the rounding mode is chosen at compilation time:
+ * FLOAT_ROUNDING_MODE must expand to one of the FLOAT_ROUND_* values below,
+ * and the rounding helpers compare against it.
+ */
+#define FLOAT_ROUND_NEAREST_EVEN    0
+#define FLOAT_ROUND_TO_ZERO         1
+#define FLOAT_ROUND_DOWN            2
+#define FLOAT_ROUND_UP              3
+#define FLOAT_ROUNDING_MODE         FLOAT_ROUND_NEAREST_EVEN
+
+/* Relax propagation of NaN.  Binary operations with a NaN source will still
+ * produce a NaN result, but it won't follow strict IEEE rules.
+ *
+ * NOTE(review): nothing in this file tests this macro with #ifdef; it
+ * appears to be kept for parity with the 64-bit implementation -- confirm.
+ */
+#define RELAXED_NAN_PROPAGATION
+
+/* Counts the zero bits that sit above the most-significant set bit of `a'.
+ * findMSB() reports -1 when no bit is set, so a zero input yields 32.
+ */
+int
+__countLeadingZeros32(uint a)
+{
+   int highestSetBit = findMSB(a);
+
+   return 31 - highestSetBit;
+}
+
+/* If a shader is in the soft-fp32 path, it almost certainly has register
+ * pressure problems.  Choose a method to exchange two values that does not
+ * require a temporary.
+ *
+ * This is the three-XOR swap, so both arguments must be integer lvalues and
+ * must name two different variables: EXCHANGE(x, x) would leave x equal to
+ * zero.  Each argument is expanded more than once, so it must not have side
+ * effects.  The do/while wrapper makes the macro usable as one statement.
+ */
+#define EXCHANGE(a, b) \
+   do {                \
+       a ^= b;         \
+       b ^= a;         \
+       a ^= b;         \
+   } while (false)
+
+/* Shifts the 32-bit value `a' right by the number of bits given in `count'.
+ * If any nonzero bits are shifted off, they are "jammed" into the least
+ * significant bit of the result by setting the least significant bit to 1.
+ *
+ *    count == 0       `a' is returned unchanged: nothing is shifted off, so
+ *                     nothing may be jammed.
+ *    0 < count < 32   (a >> count) with the sticky bit ORed in.
+ *    count >= 32      0 or 1, depending on whether `a' is zero or nonzero.
+ *
+ * Negative counts are not meaningful.  Callers in this file may pass one
+ * only when they discard the result afterwards.
+ */
+uint
+__shift32RightJamming(uint a, int count)
+{
+   /* For 0 < count < 32 this equals 32 - count, i.e. the left shift that
+    * keeps exactly the bits dropped by `a >> count'.  For count == 0 it is
+    * 0, which would test every bit of `a' rather than none of them; that
+    * case is therefore selected explicitly below.
+    */
+   int negCount = (-count) & 31;
+   uint sticky = uint((a << negCount) != 0u);
+   uint shifted = (a >> count) | sticky;
+   uint z = mix(uint(a != 0u), shifted, count < 32);
+
+   return mix(z, a, count == 0);
+}
+
+/* Assembles a single-precision bit pattern from the sign `zSign' (already in
+ * bit 31), the exponent `zExp' and the significand `zFrac'.  The exponent is
+ * moved to bit 23 and the three fields are then simply added, not ORed.
+ * Any integer portion of `zFrac' (bit 23 and above) therefore carries into
+ * the exponent.  Since a properly normalized significand has an integer
+ * portion equal to 1, `zExp' should be 1 less than the desired result
+ * exponent whenever `zFrac' is a complete, normalized significand.
+ */
+uint
+__packFloat32(uint zSign, int zExp, uint zFrac)
+{
+   uint expField = uint(zExp) << 23;
+
+   return zSign + expField + zFrac;
+}
+
+/* Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+ * and significand `zFrac', and returns the proper single-precision floating-
+ * point value corresponding to the abstract input.  Ordinarily, the abstract
+ * value is simply rounded (according to FLOAT_ROUNDING_MODE) and packed into
+ * the single-precision format.  If the abstract value is too large, an
+ * infinity is returned when the rounding mode rounds away from zero for this
+ * sign, and the largest finite value otherwise.  If the abstract value is too
+ * small, it is rounded to a subnormal number.  No exception flags are
+ * tracked.
+ *     The input significand `zFrac' has its binary point between bits 30
+ * and 29, which is 7 bits to the left of the usual location.  This shifted
+ * significand must be normalized or smaller.  If `zFrac' is not normalized,
+ * `zExp' must be 0; in that case, the result returned is a subnormal number,
+ * and it must not require rounding.  In the usual case that `zFrac' is
+ * normalized, `zExp' must be 1 less than the "true" floating-point exponent.
+ */
+uint
+__roundAndPackFloat32(uint zSign, int zExp, uint zFrac)
+{
+   bool roundNearestEven;
+   int roundIncrement;
+   int roundBits;
+
+   /* Pick the value added below the 7 guard bits before truncation: half an
+    * ulp for round-to-nearest, nothing when rounding toward zero for this
+    * sign, and just under one ulp when rounding away from zero.
+    */
+   roundNearestEven = FLOAT_ROUNDING_MODE == FLOAT_ROUND_NEAREST_EVEN;
+   roundIncrement = 0x40;
+   if (!roundNearestEven) {
+      if (FLOAT_ROUNDING_MODE == FLOAT_ROUND_TO_ZERO) {
+         roundIncrement = 0;
+      } else {
+         roundIncrement = 0x7F;
+         if (zSign != 0u) {
+            if (FLOAT_ROUNDING_MODE == FLOAT_ROUND_UP)
+               roundIncrement = 0;
+         } else {
+            if (FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN)
+               roundIncrement = 0;
+         }
+      }
+   }
+   roundBits = int(zFrac & 0x7Fu);
+
+   /* The unsigned compare catches both zExp >= 0xFD (possible overflow) and
+    * zExp < 0 (subnormal result) with a single test.
+    */
+   if (0xFDu <= uint(zExp)) {
+      /* Overflow: infinity, or infinity minus one ulp (the largest finite
+       * value) when the rounding mode never rounds this sign away from zero.
+       */
+      if ((0xFD < zExp) || ((zExp == 0xFD) && (int(zFrac) + roundIncrement) < 0))
+         return __packFloat32(zSign, 0xFF, 0u) - uint(roundIncrement == 0);
+
+      /* Subnormal: denormalize the significand, keeping a sticky bit.  The
+       * shift is evaluated unconditionally and selected with mix() so that
+       * no extra flow control is needed.
+       */
+      bool zexp_lt0 = zExp < 0;
+      uint zFrac_lt0 = __shift32RightJamming(zFrac, -zExp);
+      zFrac = mix(zFrac, zFrac_lt0, zexp_lt0);
+      roundBits = mix(roundBits, int(zFrac) & 0x7f, zexp_lt0);
+      zExp = mix(zExp, 0, zexp_lt0);
+   }
+   zFrac = (zFrac + uint(roundIncrement))>>7;
+   /* Exact tie in round-to-nearest-even mode: clear the LSB to make it even. */
+   zFrac &= ~uint(((roundBits ^ 0x40) == 0) && roundNearestEven);
+
+   /* A zero significand must be packed with a zero exponent. */
+   return __packFloat32(zSign, mix(zExp, 0, zFrac == 0u), zFrac);
+}
+
+
+/* Absolute value of a Float32: every bit except the sign bit is kept. */
+uint
+__fabs32(uint a)
+{
+   const uint signMask = 0x80000000u;
+
+   return a & ~signMask;
+}
+
+/* Returns true if the single-precision floating-point value `a' is a NaN;
+ * otherwise returns false.
+ */
+bool
+__is_nan(uint a)
+{
+   /* The native single-precision isnan() should be safe to use here whatever
+    * the rounding mode or denorm flushing settings are.
+    */
+   float value = uintBitsToFloat(a);
+
+   return isnan(value);
+}
+
+/* Negates a Float32 by flipping bit 31, the sign bit. */
+uint
+__fneg32(uint a)
+{
+   return a ^ 0x80000000u;
+}
+
+/* Returns the bit pattern of 1.0 carrying the sign of `a', or +0 when `a' is
+ * a zero of either sign.  NaN inputs are not special-cased.
+ */
+uint
+__fsign32(uint a)
+{
+   bool isZero = (a << 1) == 0u;
+   uint signedOne = (a & 0x80000000u) | floatBitsToUint(1.0);
+
+   return mix(signedOne, 0u, isZero);
+}
+
+/* Returns the 23 fraction bits (bits 22..0) of the single-precision
+ * floating-point value `a'.
+ */
+uint
+__extractFloat32Frac(uint a)
+{
+   const uint fracMask = 0x007FFFFFu;
+
+   return a & fracMask;
+}
+
+/* Returns the 8 biased exponent bits (bits 30..23) of the single-precision
+ * floating-point value `a' as a value in [0, 255].
+ */
+int
+__extractFloat32Exp(uint a)
+{
+   uint biasedExp = (a >> 23) & 0xFFu;
+
+   return int(biasedExp);
+}
+
+/* Equality of two single-precision values that are already known not to be
+ * NaN.  They are equal when their bit patterns match, or when both are zero
+ * regardless of sign (-0 == +0, in either argument order).
+ */
+bool
+__feq32_nonnan(uint a, uint b)
+{
+   bool sameBits = a == b;
+   /* Shifting out the sign bit leaves zero only if both inputs are +/-0. */
+   bool bothZero = ((a | b) << 1) == 0u;
+
+   return sameBits || bothZero;
+}
+
+/* Ordered equality: true only when neither `a' nor `b' is a NaN and the two
+ * single-precision values compare equal.  The comparison is performed
+ * according to the IEEE Standard for Floating-Point Arithmetic.
+ */
+bool
+__feq32(uint a, uint b)
+{
+   bool ordered = !(__is_nan(a) || __is_nan(b));
+
+   return ordered && __feq32_nonnan(a, b);
+}
+
+/* Unordered inequality: true when either `a' or `b' is a NaN, or when the
+ * two single-precision values compare unequal.  The comparison is performed
+ * according to the IEEE Standard for Floating-Point Arithmetic.
+ */
+bool
+__fneu32(uint a, uint b)
+{
+   bool unordered = __is_nan(a) || __is_nan(b);
+
+   return unordered || !__feq32_nonnan(a, b);
+}
+
+/* Returns the sign of the single-precision floating-point value `a', left in
+ * place at bit 31 (so the result is either 0 or 0x80000000).
+ */
+uint
+__extractFloat32Sign(uint a)
+{
+   return (a >> 31) << 31;
+}
+
+/* Less-than for two single-precision values that are already known not to
+ * be NaN.
+ */
+bool
+__flt32_nonnan(uint a, uint b)
+{
+   /* IEEE 754 floating point numbers are specifically designed so that, with
+    * two exceptions, values can be compared by bit-casting to signed integers
+    * with the same number of bits.
+    *
+    * From https://en.wikipedia.org/wiki/IEEE_754-1985#Comparing_floating-point_numbers:
+    *
+    *    When comparing as 2's-complement integers: If the sign bits differ,
+    *    the negative number precedes the positive number, so 2's complement
+    *    gives the correct result (except that negative zero and positive zero
+    *    should be considered equal). If both values are positive, the 2's
+    *    complement comparison again gives the correct result. Otherwise (two
+    *    negative numbers), the correct FP ordering is the opposite of the 2's
+    *    complement ordering.
+    *
+    * The logic implied by the above quotation is:
+    *
+    *    !both_are_zero(a, b) && (both_negative(a, b) ? a > b : a < b)
+    *
+    * This is equivalent to
+    *
+    *    fneu(a, b) && (both_negative(a, b) ? a >= b : a < b)
+    *
+    *    fneu(a, b) && (both_negative(a, b) ? !(a < b) : a < b)
+    *
+    *    fneu(a, b) && ((both_negative(a, b) && !(a < b)) ||
+    *                  (!both_negative(a, b) && (a < b)))
+    *
+    * (A && !B) || (!A && B) is (A xor B) which is implemented here using !=.
+    *
+    *    fneu(a, b) && (both_negative(a, b) != (a < b))
+    *
+    * The integer comparison has to be a SIGNED one: with an unsigned compare
+    * every negative value would sort above every positive value.
+    */
+   bool lt = int(a) < int(b);
+   bool both_negative = (a & b & 0x80000000u) != 0u;
+
+   /* The equality test is spelled out here so that -0 and +0 count as equal
+    * in either argument order.
+    */
+   bool equal = (a == b) || (((a | b) << 1) == 0u);
+
+   return !equal && (lt != both_negative);
+}
+
+/* Ordering predicate used by fmin/fmax for operands known not to be NaN.
+ * Unlike a plain less-than it treats -0 as smaller than +0.
+ */
+bool
+__flt32_nonnan_minmax(uint a, uint b)
+{
+
+   /* For implementing fmin/fmax, we compare -0 < 0, so the logic implied by
+    * comparing the bit patterns as signed 2's-complement integers is simple:
+    *
+    *    both_negative(a, b) ? a > b : a < b
+    *
+    * If a == b, it doesn't matter what we return, so that's equivalent to:
+    *
+    *    both_negative(a, b) ? a >= b : a < b
+    *    both_negative(a, b) ? !(a < b) : a < b
+    *    both_negative(a, b) ^ (a < b)
+    *
+    * XOR is implemented using !=.  The integer comparison must be SIGNED;
+    * an unsigned compare would order every negative value above every
+    * positive one.
+    */
+   bool lt = int(a) < int(b);
+   bool both_negative = (a & b & 0x80000000u) != 0u;
+
+   return (lt != both_negative);
+}
+
+/* Ordered less-than: true when neither operand is a NaN and the
+ * single-precision value `a' is less than `b'.  The comparison is performed
+ * according to the IEEE Standard for Floating-Point Arithmetic.
+ */
+bool
+__flt32(uint a, uint b)
+{
+   /* All three terms are computed up front on purpose.  Writing this the
+    * "obvious" way makes the compiler insert flow control to honour
+    * short-circuit evaluation, and flow control is bad!
+    */
+   bool a_ordered = !__is_nan(a);
+   bool b_ordered = !__is_nan(b);
+   bool less = __flt32_nonnan(a, b);
+
+   return (a_ordered && b_ordered && less);
+}
+
+/* Ordered greater-or-equal: true when neither operand is a NaN and the
+ * single-precision value `a' is not less than `b'.  The comparison is
+ * performed according to the IEEE Standard for Floating-Point Arithmetic.
+ */
+bool
+__fge32(uint a, uint b)
+{
+   /* All three terms are computed up front on purpose.  Writing this the
+    * "obvious" way makes the compiler insert flow control to honour
+    * short-circuit evaluation, and flow control is bad!
+    */
+   bool a_ordered = !__is_nan(a);
+   bool b_ordered = !__is_nan(b);
+   bool not_less = !__flt32_nonnan(a, b);
+
+   return (a_ordered && b_ordered && not_less);
+}
+
+/* Clamps the single-precision value `a' to the range [0.0, 1.0].  NaN and
+ * every input with the sign bit set (including -0) produce +0.
+ *
+ * The name carries the `__' prefix used by every other entry point in this
+ * file; the NIR lowering pass looks the function up as "__fsat32(u1;".
+ */
+uint
+__fsat32(uint a)
+{
+   /* fsat(NaN) should be zero. */
+   if (__is_nan(a) || int(a) < 0)
+      return 0u;
+
+   /* IEEE 754 floating point numbers are specifically designed so that, with
+    * two exceptions, values can be compared by bit-casting to signed integers
+    * with the same number of bits.
+    *
+    * From https://en.wikipedia.org/wiki/IEEE_754-1985#Comparing_floating-point_numbers:
+    *
+    *    When comparing as 2's-complement integers: If the sign bits differ,
+    *    the negative number precedes the positive number, so 2's complement
+    *    gives the correct result (except that negative zero and positive zero
+    *    should be considered equal). If both values are positive, the 2's
+    *    complement comparison again gives the correct result. Otherwise (two
+    *    negative numbers), the correct FP ordering is the opposite of the 2's
+    *    complement ordering.
+    *
+    * We know that both values are not negative, and we know that at least one
+    * value is not zero.  Therefore, we can just use the integer comparison
+    * ordering.
+    */
+   if (floatBitsToUint(1.0) < a)
+      return floatBitsToUint(1.0);
+
+   return a;
+}
+
+/* Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+ * and significand `zFrac', and returns the proper single-precision
+ * floating-point value corresponding to the abstract input.  This routine is
+ * just like `__roundAndPackFloat32' except that the input significand does
+ * not have to be normalized: it is first shifted left until its leading 1
+ * sits in bit 30, with the exponent lowered by the same amount.  In all
+ * cases, `zExp' must be 1 less than the "true" floating-point exponent.
+ * Bit 31 of `zFrac' must be clear, otherwise the shift count is negative.
+ */
+uint
+__normalizeRoundAndPackFloat32(uint zSign,
+                               int zExp,
+                               uint zFrac)
+{
+   int shiftCount = __countLeadingZeros32(zFrac) - 1;
+   int normExp = zExp - shiftCount;
+   uint normFrac = zFrac << shiftCount;
+
+   return __roundAndPackFloat32(zSign, normExp, normFrac);
+}
+
+/* Adds `a' and `b' with the native float addition.  __fadd32 calls this
+ * when an operand's exponent field is 0xFF (infinity or NaN), letting the
+ * hardware produce the Inf/NaN result.
+ */
+uint
+__propagateFloat32NaNInfAdd(uint a, uint b)
+{
+   float fa = uintBitsToFloat(a);
+   float fb = uintBitsToFloat(b);
+
+   return floatBitsToUint(fa + fb);
+}
+
+/* Multiplies `a' and `b' with the native float multiply.  __fmul32 calls
+ * this for its Inf/NaN cases where the native result is acceptable.
+ */
+uint
+__propagateFloat32NaNInfMul(uint a, uint b)
+{
+   float fa = uintBitsToFloat(a);
+   float fb = uintBitsToFloat(b);
+
+   return floatBitsToUint(fa * fb);
+}
+
+/* Returns the result of adding the single-precision floating-point values
+ * `a' and `b'.  The operation is performed according to the IEEE Standard for
+ * Floating-Point Arithmetic, including subnormal inputs and results.
+ *
+ * Operands with equal signs have their magnitudes added; operands with
+ * opposite signs have the smaller magnitude subtracted from the larger one.
+ * Infinities and NaNs are handed to the native addition.
+ */
+uint
+__fadd32(uint a, uint b)
+{
+   uint aSign = __extractFloat32Sign(a);
+   uint bSign = __extractFloat32Sign(b);
+   uint aFrac = __extractFloat32Frac(a);
+   uint bFrac = __extractFloat32Frac(b);
+   int aExp = __extractFloat32Exp(a);
+   int bExp = __extractFloat32Exp(b);
+   int expDiff = aExp - bExp;
+   if (aSign == bSign) {
+      /* Magnitude addition.  The fractions get 6 guard bits, which puts the
+       * implicit leading 1 at bit 29.
+       */
+      uint zFrac;
+      int zExp;
+      aFrac <<= 6;
+      bFrac <<= 6;
+      if (expDiff == 0) {
+         if (aExp == 0xFF)
+            return __propagateFloat32NaNInfAdd(a, b);
+         /* Two subnormals (or zeros): the sum is exact, and a carry out of
+          * the fraction lands in the exponent field as it should.
+          */
+         if (aExp == 0)
+            return __packFloat32(aSign, 0, (aFrac + bFrac)>>6);
+         /* Both implicit bits add up to 0x40000000. */
+         zFrac = 0x40000000 + aFrac + bFrac;
+         zExp = aExp;
+      } else {
+         /* Make `a' the operand with the larger exponent. */
+         if (expDiff < 0) {
+            EXCHANGE(aFrac, bFrac);
+            EXCHANGE(aExp, bExp);
+         }
+
+         if (aExp == 0xFF)
+            return __propagateFloat32NaNInfAdd(a, b);
+
+         /* A subnormal `b' has no implicit bit and an effective exponent of
+          * 1, so it is aligned with one shift fewer.
+          */
+         expDiff = mix(abs(expDiff), abs(expDiff) - 1, bExp == 0);
+         bFrac = mix(bFrac | 0x20000000u, bFrac, bExp == 0);
+         bFrac = __shift32RightJamming(bFrac, expDiff);
+         zExp = aExp;
+
+         aFrac |= 0x20000000;
+         /* Assume no carry and normalize to bit 30; if that pushes a bit
+          * into bit 31 there was a carry, so undo the shift.
+          */
+         zFrac = (aFrac + bFrac)<<1;
+         --zExp;
+         if (int(zFrac) < 0) {
+            zFrac = aFrac + bFrac;
+            ++zExp;
+         }
+      }
+      return __roundAndPackFloat32(aSign, zExp, zFrac);
+   } else {
+      /* Magnitude subtraction.  The fractions get 7 guard bits, which puts
+       * the implicit leading 1 at bit 30.
+       */
+      int zExp;
+
+      aFrac <<= 7;
+      bFrac <<= 7;
+      if (expDiff != 0) {
+         uint zFrac;
+
+         /* Make `a' the operand with the larger exponent; the result then
+          * takes the sign of the original `b'.
+          */
+         if (expDiff < 0) {
+            EXCHANGE(aFrac, bFrac);
+            EXCHANGE(aExp, bExp);
+            aSign ^= 0x80000000u;
+         }
+         if (aExp == 0xFF)
+            return __propagateFloat32NaNInfAdd(a, b);
+
+         expDiff = mix(abs(expDiff), abs(expDiff) - 1, bExp == 0);
+         bFrac = mix(bFrac | 0x40000000u, bFrac, bExp == 0);
+         bFrac = __shift32RightJamming(bFrac, expDiff);
+         aFrac |= 0x40000000;
+         zFrac = aFrac - bFrac;
+         zExp = aExp;
+         --zExp;
+         return __normalizeRoundAndPackFloat32(aSign, zExp, zFrac);
+      }
+      if (aExp == 0xFF)
+         return __propagateFloat32NaNInfAdd(a, b);
+      /* Subnormals have an effective exponent of 1. */
+      bExp = mix(bExp, 1, aExp == 0);
+      aExp = mix(aExp, 1, aExp == 0);
+
+      /* Equal exponents: the implicit bits cancel, so only the fractions
+       * are subtracted, larger minus smaller.
+       */
+      uint zFrac;
+      uint sign_of_difference = 0;
+      if (bFrac <= aFrac) {
+         /* It is possible that zFrac may be zero after this. */
+         zFrac = aFrac - bFrac;
+      } else {
+         zFrac = bFrac - aFrac;
+         sign_of_difference = 0x80000000;
+      }
+      zExp = mix(bExp, aExp, sign_of_difference == 0u);
+      aSign ^= sign_of_difference;
+      /* An exact zero result is +0, except when rounding down, where it is
+       * -0.
+       */
+      uint retval_0 = __packFloat32(uint(FLOAT_ROUNDING_MODE == FLOAT_ROUND_DOWN) << 31, 0, 0u);
+      /* The difference uses the same fixed-point layout as the
+       * unequal-exponent path above (leading 1 of an operand at bit 30), so
+       * the exponent handed to the normalizer must likewise be one less
+       * than the operand exponent.  Passing zExp itself doubles the result.
+       */
+      uint retval_1 = __normalizeRoundAndPackFloat32(aSign, zExp - 1, zFrac);
+      return mix(retval_0, retval_1, zFrac != 0u);
+   }
+}
+
+/* Normalizes the subnormal single-precision floating-point value represented
+ * by the denormalized significand `aFrac'.  The significand is shifted left
+ * until its leading 1 reaches bit 23, and the exponent is lowered to match.
+ * The normalized exponent and significand are returned through `zExpPtr' and
+ * `zFracPtr', respectively.  `aFrac' must be nonzero.
+ */
+void
+__normalizeFloat32Subnormal(uint aFrac,
+                            out int zExpPtr,
+                            out uint zFracPtr)
+{
+   int shiftCount = __countLeadingZeros32(aFrac) - 8;
+
+   zExpPtr = 1 - shiftCount;
+   zFracPtr = aFrac << shiftCount;
+}
+
+/* Returns the result of multiplying the single-precision floating-point values
+ * `a' and `b'.  The operation is performed according to the IEEE Standard for
+ * Floating-Point Arithmetic, including subnormal inputs and results.
+ */
+uint
+__fmul32(uint a, uint b)
+{
+   uint aSign = __extractFloat32Sign(a);
+   uint bSign = __extractFloat32Sign(b);
+   int aExp = __extractFloat32Exp(a);
+   int bExp = __extractFloat32Exp(b);
+   uint aFrac = __extractFloat32Frac(a);
+   uint bFrac = __extractFloat32Frac(b);
+   uint zSign = aSign ^ bSign;
+
+   if (aExp == 0xFF) {
+      /* Subnormal values times infinity equals infinity, but other cases can
+       * use the builtin multiply that may flush denorms to 0.
+       */
+      bool aIsNaN = aFrac != 0u;
+      bool bIsNaN = (bExp == 0xFF) && (bFrac != 0u);
+      bool bIsZero = (uint(bExp) | bFrac) == 0u;
+
+      if (aIsNaN || bIsNaN || bIsZero)
+         return __propagateFloat32NaNInfMul(a, b);
+      return __packFloat32(zSign, 0xFF, 0u);
+   }
+   if (bExp == 0xFF) {
+      bool bIsNaN = bFrac != 0u;
+      bool aIsZero = (uint(aExp) | aFrac) == 0u;
+
+      if (bIsNaN || aIsZero)
+         return __propagateFloat32NaNInfMul(a, b);
+      return __packFloat32(zSign, 0xFF, 0u);
+   }
+
+   /* A zero operand gives a signed zero; a subnormal operand is normalized
+    * so that the code below can treat it like any other value.
+    */
+   if (aExp == 0) {
+      if (aFrac == 0u)
+         return __packFloat32(zSign, 0, 0u);
+      __normalizeFloat32Subnormal(aFrac, aExp, aFrac);
+   }
+   if (bExp == 0) {
+      if (bFrac == 0u)
+         return __packFloat32(zSign, 0, 0u);
+      __normalizeFloat32Subnormal(bFrac, bExp, bFrac);
+   }
+
+   int zExp = aExp + bExp - 0x7F;
+
+   /* With the implicit bits restored and the operands pre-shifted by 7 and
+    * 8, the high word of the 64-bit product has its leading 1 in bit 30 or
+    * bit 29.  Anything left in the low word only matters as a sticky bit.
+    */
+   uint prodHi = 0u;
+   uint prodLo = 0u;
+   umulExtended((aFrac | 0x00800000u) << 7, (bFrac | 0x00800000u) << 8,
+                prodHi, prodLo);
+   prodHi |= uint(prodLo != 0u);
+
+   /* Leading 1 in bit 29: shift up by one so it sits in bit 30. */
+   if (0 < int(prodHi << 1)) {
+      prodHi <<= 1;
+      --zExp;
+   }
+   return __roundAndPackFloat32(zSign, zExp, prodHi);
+}
+
+/* Computes a * b + c as a multiply followed by an add.  The product is
+ * rounded to single precision before the addition, so this is not a fused
+ * operation.
+ */
+uint
+__ffma32(uint a, uint b, uint c)
+{
+   uint product = __fmul32(a, b);
+
+   return __fadd32(product, c);
+}
+
+/* Returns the smaller of `a' and `b', with -0 ordered below +0.  If exactly
+ * one operand is a NaN the other operand is returned; if `a' is a NaN the
+ * result is `b' whatever `b' holds.
+ */
+uint
+__fmin32(uint a, uint b)
+{
+   /* All terms are computed up front on purpose.  Writing this the "obvious"
+    * way makes the compiler insert flow control to honour short-circuit
+    * evaluation, and flow control is bad!
+    */
+   bool b_nan = __is_nan(b);
+   bool a_lt_b = __flt32_nonnan_minmax(a, b);
+   bool a_nan = __is_nan(a);
+   bool pick_a = (b_nan || a_lt_b) && !a_nan;
+
+   return mix(b, a, pick_a);
+}
+
+/* Returns the larger of `a' and `b', with +0 ordered above -0.  If exactly
+ * one operand is a NaN the other operand is returned; if `a' is a NaN the
+ * result is `b' whatever `b' holds.
+ */
+uint
+__fmax32(uint a, uint b)
+{
+   /* All terms are computed up front on purpose.  Writing this the "obvious"
+    * way makes the compiler insert flow control to honour short-circuit
+    * evaluation, and flow control is bad!
+    */
+   bool b_nan = __is_nan(b);
+   bool a_lt_b = __flt32_nonnan_minmax(a, b);
+   bool a_nan = __is_nan(a);
+   bool pick_b = (!b_nan && a_lt_b) || a_nan;
+
+   return mix(a, b, pick_b);
+}
+
--- a/src/compiler/meson.build
+++ b/src/compiler/meson.build
@ -25,6 +25,7 @@ builtin_types_c = custom_target(
 float64_glsl_file = files('glsl/float64.glsl')

 astc_decoder_glsl_file = files('glsl/astc_decoder.glsl')
+float32_glsl_file = files('glsl/float32.glsl')

 files_libcompiler = files(
  'glsl_types.c',
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@ -166,6 +166,7 @@ else
  'nir_lower_explicit_io.c',
  'nir_lower_fb_read.c',
  'nir_lower_flatshade.c',
+  'nir_lower_floats.c',
  'nir_lower_flrp.c',
  'nir_lower_fp16_conv.c',
  'nir_lower_fragcoord_wtrans.c',
@ -326,6 +327,7 @@ else
  'nir_serialize.h',
  'nir_shader_bisect.c',
  'nir_shader_compiler_options.h',
+  'nir_softfloat.h',
  'nir_split_64bit_vec3_and_vec4.c',
  'nir_split_conversions.c',
  'nir_split_per_member_structs.c',
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@ -6063,6 +6063,7 @@ bool nir_lower_int64_float_conversions(nir_shader *shader);
 nir_lower_doubles_options nir_lower_doubles_op_to_options_mask(nir_op opcode);
 bool nir_lower_doubles(nir_shader *shader, const nir_shader *softfp64,
                       nir_lower_doubles_options options);
+bool nir_lower_floats(nir_shader *shader, const nir_shader *softfp32);
 bool nir_lower_pack(nir_shader *shader);

 nir_intrinsic_instr *nir_get_io_intrinsic(nir_instr *instr, nir_variable_mode modes,
--- a/src/compiler/nir/nir_lower_double_ops.c
+++ b/src/compiler/nir/nir_lower_double_ops.c
@ -24,6 +24,7 @@

 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_softfloat.h"

 #include <float.h>
 #include <math.h>
@ -692,37 +693,7 @@ lower_doubles_instr_to_soft(nir_builder *b, nir_alu_instr *instr,
      assert(func);
   }

-   nir_def *params[4] = {
-      NULL,
-   };
-
-   nir_variable *ret_tmp =
-      nir_local_variable_create(b->impl, return_type, "return_tmp");
-   nir_deref_instr *ret_deref = nir_build_deref_var(b, ret_tmp);
-   params[0] = &ret_deref->def;
-
-   assert(nir_op_infos[instr->op].num_inputs + 1 == func->num_params);
-   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
-      nir_alu_type n_type =
-         nir_alu_type_get_base_type(nir_op_infos[instr->op].input_types[i]);
-      /* Add bitsize */
-      n_type = n_type | instr->src[0].src.ssa->bit_size;
-
-      const struct glsl_type *param_type =
-         glsl_scalar_type(nir_get_glsl_base_type_for_nir_type(n_type));
-
-      nir_variable *param =
-         nir_local_variable_create(b->impl, param_type, "param");
-      nir_deref_instr *param_deref = nir_build_deref_var(b, param);
-      nir_store_deref(b, param_deref, nir_mov_alu(b, instr->src[i], 1), ~0);
-
-      assert(i + 1 < ARRAY_SIZE(params));
-      params[i + 1] = &param_deref->def;
-   }
-
-   nir_inline_function_impl(b, func->impl, params, NULL);
-
-   return nir_load_deref(b, ret_deref);
+   return nir_lower_softfloat_func(b, instr, func, return_type);
 }

 nir_lower_doubles_options
--- a/src/compiler/nir/nir_lower_floats.c
+++ b/src/compiler/nir/nir_lower_floats.c
@ -0,0 +1,156 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ * Copyright © 2025 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_softfloat.h"
+
+/**
+ * Lowering callback: replaces one supported ALU instruction with an inlined
+ * call to the matching function of the soft-float32 library.
+ *
+ * \param b      builder positioned by this function before \p instr
+ * \param instr  instruction to lower; must be an ALU instruction
+ * \param data   the `const nir_shader *` holding the soft-float32 library
+ * \return the value produced by the inlined function, or NULL when the
+ *         opcode is not handled or the library function is unavailable
+ */
+static nir_def *
+lower_float_instr_to_soft(nir_builder *b, nir_instr *instr,
+                          void *data)
+{
+   const char *mangled_name;
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+   /* Float results are carried as their uint bit pattern; comparisons
+    * override this with bool below.
+    */
+   const struct glsl_type *return_type = glsl_uint_type();
+   const nir_shader *softfp32 = data;
+
+   /* Names are in mangled form: one "u1;" per uint parameter. */
+   switch (alu->op) {
+   case nir_op_fabs:
+      mangled_name = "__fabs32(u1;";
+      break;
+   case nir_op_fneg:
+      mangled_name = "__fneg32(u1;";
+      break;
+   case nir_op_fsign:
+      mangled_name = "__fsign32(u1;";
+      break;
+   case nir_op_feq:
+      mangled_name = "__feq32(u1;u1;";
+      return_type = glsl_bool_type();
+      break;
+   case nir_op_fneu:
+      mangled_name = "__fneu32(u1;u1;";
+      return_type = glsl_bool_type();
+      break;
+   case nir_op_flt:
+      mangled_name = "__flt32(u1;u1;";
+      return_type = glsl_bool_type();
+      break;
+   case nir_op_fge:
+      mangled_name = "__fge32(u1;u1;";
+      return_type = glsl_bool_type();
+      break;
+   case nir_op_fmin:
+      mangled_name = "__fmin32(u1;u1;";
+      break;
+   case nir_op_fmax:
+      mangled_name = "__fmax32(u1;u1;";
+      break;
+   case nir_op_fadd:
+      mangled_name = "__fadd32(u1;u1;";
+      break;
+   case nir_op_fmul:
+      mangled_name = "__fmul32(u1;u1;";
+      break;
+   case nir_op_ffma:
+      mangled_name = "__ffma32(u1;u1;u1;";
+      break;
+   case nir_op_fsat:
+      mangled_name = "__fsat32(u1;";
+      break;
+   default:
+      return NULL;
+   }
+
+   nir_function *func = nir_shader_get_function_for_name(softfp32, mangled_name);
+
+   /* A library function without an implementation is just as unusable as a
+    * missing one.  Fail loudly in debug builds; in release builds leave the
+    * instruction untouched instead of dereferencing a NULL pointer.
+    */
+   if (!func || !func->impl) {
+      fprintf(stderr, "Cannot find function \"%s\"\n", mangled_name);
+      assert(!"soft-float32 library function is missing");
+      return NULL;
+   }
+
+   /* Some of the implementations use floating-point primitives in a way where
+    * rounding mode and denorm mode does not matter, for example to propagate
+    * NaNs. By inserting everything before the instruction we avoid iterating
+    * over the inlined instructions again and avoid calling the lowering on
+    * them, avoiding infinite loops.
+    */
+   b->cursor = nir_before_instr(instr);
+
+   return nir_lower_softfloat_func(b, alu, func, return_type);
+}
+
+/* Filter callback: selects ALU instructions whose first source is 32 bits
+ * wide.  The opcode itself is examined later by the lowering callback.
+ * `_data' is unused.
+ */
+static bool
+should_lower_float_instr(const nir_instr *instr, const void *_data)
+{
+   if (instr->type == nir_instr_type_alu) {
+      const nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+      return alu->src[0].src.ssa->bit_size == 32;
+   }
+
+   return false;
+}
+
+/* Runs the soft-float32 lowering over one function implementation and
+ * returns whether any instruction was replaced.
+ */
+static bool
+nir_lower_floats_impl(nir_function_impl *impl,
+                      const nir_shader *softfp32)
+{
+   const bool progress =
+      nir_function_impl_lower_instructions(impl,
+                                           should_lower_float_instr,
+                                           lower_float_instr_to_soft,
+                                           (void *)softfp32);
+
+   if (!progress) {
+      nir_progress(false, impl, nir_metadata_control_flow);
+      return false;
+   }
+
+   /* Indices are completely messed up now */
+   nir_index_ssa_defs(impl);
+
+   nir_progress(true, impl, nir_metadata_none);
+
+   /* And we have deref casts we need to clean up thanks to function
+    * inlining.
+    */
+   nir_opt_deref_impl(impl);
+
+   return true;
+}
+
+/* Some implementations cannot preserve denorms for single-precision floats.
+ * This pass replaces the supported 32-bit float operations in every function
+ * of `shader' with calls into the soft-float library `softfp32', for use when
+ * denorms are forced on.  Returns true if anything was changed.
+ */
+bool
+nir_lower_floats(nir_shader *shader,
+                 const nir_shader *softfp32)
+{
+   bool any_progress = false;
+
+   nir_foreach_function_impl(impl, shader) {
+      if (nir_lower_floats_impl(impl, softfp32))
+         any_progress = true;
+   }
+
+   return any_progress;
+}
--- a/src/compiler/nir/nir_softfloat.h
+++ b/src/compiler/nir/nir_softfloat.h
@ -0,0 +1,68 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ * Copyright © 2025 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef NIR_SOFTFLOAT_H
+#define NIR_SOFTFLOAT_H
+
+#include "nir.h"
+#include "nir_builder.h"
+
+/**
+ * Replaces an ALU instruction with an inlined call to a soft-float function.
+ *
+ * A local variable is created for the return value and one for each source
+ * of \p instr; the sources are stored to their variables, the function body
+ * is inlined at the builder's cursor with derefs of those variables as
+ * arguments, and the return variable is loaded back.
+ *
+ * \param b               builder; its cursor selects where code is emitted
+ * \param instr           ALU instruction being replaced.  Only the first
+ *                        component of each source is forwarded, so the
+ *                        instruction is expected to be scalar.
+ * \param softfloat_func  function to inline; it must have an implementation
+ *                        and take one return parameter plus one parameter
+ *                        per ALU source
+ * \param return_type     GLSL type of the return variable
+ * \return the value loaded from the return variable
+ */
+static inline nir_def *
+nir_lower_softfloat_func(nir_builder *b,
+                         nir_alu_instr *instr,
+                         nir_function *softfloat_func,
+                         const struct glsl_type *return_type)
+{
+   const unsigned num_inputs = nir_op_infos[instr->op].num_inputs;
+   nir_def *params[4] = {
+      NULL,
+   };
+
+   /* Parameter 0 is the deref that receives the return value. */
+   nir_variable *ret_tmp =
+      nir_local_variable_create(b->impl, return_type, "return_tmp");
+   nir_deref_instr *ret_deref = nir_build_deref_var(b, ret_tmp);
+   params[0] = &ret_deref->def;
+
+   assert(num_inputs + 1 == softfloat_func->num_params);
+   for (unsigned i = 0; i < num_inputs; i++) {
+      nir_alu_type n_type =
+         nir_alu_type_get_base_type(nir_op_infos[instr->op].input_types[i]);
+      /* Add the bit size of this particular source, so that a source whose
+       * size differs from the first one still gets a matching variable.
+       */
+      n_type = n_type | instr->src[i].src.ssa->bit_size;
+
+      const struct glsl_type *param_type =
+         glsl_scalar_type(nir_get_glsl_base_type_for_nir_type(n_type));
+
+      nir_variable *param =
+         nir_local_variable_create(b->impl, param_type, "param");
+      nir_deref_instr *param_deref = nir_build_deref_var(b, param);
+      nir_store_deref(b, param_deref, nir_mov_alu(b, instr->src[i], 1), ~0);
+
+      assert(i + 1 < ARRAY_SIZE(params));
+      params[i + 1] = &param_deref->def;
+   }
+
+   nir_inline_function_impl(b, softfloat_func->impl, params, NULL);
+
+   return nir_load_deref(b, ret_deref);
+}
+
+#endif /* NIR_SOFTFLOAT_H */
+