nir/opt_algebraic_tests: Test !nir_fp_preserve_signed_zero behavior.

Iterate over a set of sign-flips for 0.0s to see if we can find a set that
makes the search and replace sides match.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39369>
This commit is contained in:
Emma Anholt 2026-01-16 22:27:01 -08:00 committed by Marge Bot
parent d9eb9a3eef
commit 7fd0287a89
3 changed files with 108 additions and 43 deletions

View file

@ -268,10 +268,10 @@ optimizations = [
(('fadd', a, a), ('fmul', a, 2.0)),
(('fadd(contract)', a, ('fadd(is_used_once)', a, b)), ('fadd', b, ('fmul', a, 2.0))),
(('~fmul', a, 0.0), 0.0),
(('~fmul', a, -0.0), 0.0, 'true', TestStatus.UNSUPPORTED), # No support for inexactly testing -0.0 inputs
(('~fmul', a, -0.0), 0.0),
# The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
(('fmul(nsz,nnan)', 'a', 0.0), 0.0),
(('fmul(nsz,nnan)', 'a', -0.0), 0.0, 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('fmul(nsz,nnan)', 'a', -0.0), 0.0),
(('fmulz', a, 0.0), 0.0),
(('fmulz', a, -0.0), 0.0),
(('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)),
@ -294,7 +294,7 @@ optimizations = [
(('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
(('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
(('ffma(nsz,nnan)', 0.0, a, b), ('fcanonicalize', b)),
(('ffma(nsz,nnan)', -0.0, a, b), ('fcanonicalize', b), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('ffma(nsz,nnan)', -0.0, a, b), ('fcanonicalize', b)),
(('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
(('ffmaz', -0.0, a, b), ('fadd', 0.0, b)),
(('ffma(nsz)', a, b, 0.0), ('fmul', a, b)),
@ -308,11 +308,11 @@ optimizations = [
(('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
(('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
(('flrp(nnan,nsz)', a, b, 0.0), ('fcanonicalize', a)),
(('flrp(nnan,nsz)', a, b, -0.0), ('fcanonicalize', a), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('flrp(nnan,nsz)', a, b, -0.0), ('fcanonicalize', a)),
(('flrp(nnan,nsz)', a, b, 1.0), ('fcanonicalize', b)),
(('~flrp', a, a, b), ('fcanonicalize', a)),
(('flrp(nnan,nsz)', 0.0, a, b), ('fmul', a, b)),
(('flrp(nnan,nsz)', -0.0, a, b), ('fmul', a, b), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('flrp(nnan,nsz)', -0.0, a, b), ('fmul', a, b)),
# flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
(('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
@ -974,8 +974,8 @@ optimizations.extend([
(('fneg', ('fmin(is_used_once)', ('fneg', a), ('fneg', b))), ('fmax', a, b)),
(('fneg', ('fmax(is_used_once)', ('fneg', a), '#b')), ('fmin', a, ('fneg', b))),
(('fneg', ('fmin(is_used_once)', ('fneg', a), '#b')), ('fmax', a, ('fneg', b))),
(('fmin(nsz)', a, -0.0), ('fmin', a, 0.0), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('fmax(nsz)', a, -0.0), ('fmax', a, 0.0), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('fmin(nsz)', a, -0.0), ('fmin', a, 0.0)),
(('fmax(nsz)', a, -0.0), ('fmax', a, 0.0)),
])
for op in ['ine', 'ieq', 'ilt', 'ige', 'ult', 'uge', 'bitz', 'bitnz',
@ -3463,7 +3463,7 @@ optimizations.extend([
('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)),
('iand', a, 1 << 31),
('!f2f32', ('!f2f16_rtne', a))),
'options->lower_fquantize2f16', TestStatus.UNSUPPORTED), # All test inputs skipped.
'options->lower_fquantize2f16'),
])
for s in range(0, 31):
@ -4182,8 +4182,6 @@ for t in ['f', 'i']:
add_used_once = '~{}add(is_used_once)'.format(t)
add = '~{}add'.format(t)
mul = '~{}mul'.format(t)
# With so many inputs, we always get an inf or a nan in testing.
expected = TestStatus.UNSUPPORTED if t == 'f' else TestStatus.PASS
# Variable names used below were selected based on these layouts:
# mat4 mat4 vec4
@ -4203,10 +4201,10 @@ for t in ['f', 'i']:
step8 = (add, (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), (mul, 'ee', 'ii')), (mul, 'ff', 'jj'))
# This finds and replaces common (mat4*mat4)*vec4 with something that will get optimised down to mat4*(mat4*vec4)
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), (mul, step4, 'jj')), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), (mul, step4, 'jj')), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')))]
# This helps propagate the above improvement further up the mul chain e.g. mat4*mat4*mat4*vec4 to (mat4*vec4)*mat4*mat4
mat_mul_optimizations += [((add_first, (add, (add, (mul, 'gg', step1), (mul,'hh', step2)), (mul, 'ii', step3)), (mul, 'jj', step4)), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (add, (mul, 'gg', step1), (mul,'hh', step2)), (mul, 'ii', step3)), (mul, 'jj', step4)), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')))]
# Below handles a real world shader that looks like this mat4*mat4*vec4(xyz, 1.0) where the multiplication of the 1.0 constant has been optimised away
step5_no_w_mul = (add, (add, (add, (mul, 'q', 'gg'), (mul, 'r', 'hh')), (mul, 's', 'ii')), 't')
@ -4214,7 +4212,7 @@ for t in ['f', 'i']:
step7_no_w_mul = (add, (add, (add, (mul, 'y', 'gg'), (mul, 'z', 'hh')), (mul, 'aa', 'ii')), 'bb')
step8_no_w_mul = (add, (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), (mul, 'ee', 'ii')), 'ff')
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), step4), (add, (add, (add, (mul, step5_no_w_mul, 'a'), (mul, step6_no_w_mul, 'b')), (mul, step7_no_w_mul, 'c')), (mul, step8_no_w_mul, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), step4), (add, (add, (add, (mul, step5_no_w_mul, 'a'), (mul, step6_no_w_mul, 'b')), (mul, step7_no_w_mul, 'c')), (mul, step8_no_w_mul, 'd')))]
# Below handles a real world shader that looks like this mat4*mat4*vec4(xy, 0.0, 1.0) where the multiplication of the 0.0 and 1.0 constants has been optimised away
step5_zero_z_no_w_mul = (add, (add, (mul, 'q', 'gg'), (mul, 'r', 'hh')), 't')
@ -4222,7 +4220,7 @@ for t in ['f', 'i']:
step7_zero_z_no_w_mul = (add, (add, (mul, 'y', 'gg'), (mul, 'z', 'hh')), 'bb')
step8_zero_z_no_w_mul = (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), 'ff')
mat_mul_optimizations += [((add_first, (add, (mul, step1, 'gg'), step4), (mul, step2, 'hh')), (add, (add, (add, (mul, step5_zero_z_no_w_mul, 'a'), (mul, step6_zero_z_no_w_mul, 'b')), (mul, step7_zero_z_no_w_mul, 'c')), (mul, step8_zero_z_no_w_mul, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (mul, step1, 'gg'), step4), (mul, step2, 'hh')), (add, (add, (add, (mul, step5_zero_z_no_w_mul, 'a'), (mul, step6_zero_z_no_w_mul, 'b')), (mul, step7_zero_z_no_w_mul, 'c')), (mul, step8_zero_z_no_w_mul, 'd')))]
before_lower_int64_optimizations = [
# The i2i64(a) implies that 'a' has at most 32-bits of data.

View file

@ -199,6 +199,37 @@ static const double float_inputs[INPUT_VALUE_COUNT] = {
DBL_MIN,
};
void
nir_algebraic_pattern_test::handle_signed_zero(nir_const_value *val, uint32_t bit_size)
{
   /* Possibly flip the sign of a floating-point zero, and count the zeroes
    * encountered, as part of the search over sign-flip combinations for
    * !nir_fp_preserve_signed_zero testing.
    *
    * Protocol with the caller (see validate_pattern): on iteration 0
    * (signed_zero_iter == 0) this only tallies zeroes into
    * signed_zero_count; on later iterations, the bits of signed_zero_iter
    * select which of the encountered zeroes get their sign flipped, so the
    * caller can look for a sign assignment under which the search and
    * replace sides match.
    */

   /* Only values that are exactly 0.0 at a float-capable bit size (16/32/64)
    * are candidates.
    */
   if (bit_size < 16 || nir_const_value_as_float(*val, bit_size) != 0.0)
      return;

   /* If we're preserving signed zeroes, no need to do any of this work. */
   if (exact && (fp_math_ctrl & nir_fp_preserve_signed_zero))
      return;

   if (signed_zero_iter != 0) {
      /* Flip this zero's sign if its bit is set in the iteration mask.  Only
       * the first 32 zeroes are addressable by the 32-bit mask; the caller is
       * responsible for detecting the non-exhaustive case.
       */
      if (signed_zero_count < 32 &&
          (1u << signed_zero_count) & signed_zero_iter) {
         switch (bit_size) {
         case 16:
            /* fp16 has no native C type here, so toggle the sign bit directly. */
            val->u16 ^= 0x8000;
            break;
         case 32:
            val->f32 = -val->f32;
            break;
         case 64:
            val->f64 = -val->f64;
            break;
         default:
            UNREACHABLE("bad bit size");
         }
      }
   }

   /* Tally every zero seen, whether or not it was flipped -- iteration 0
    * relies on this to establish how many flip combinations exist.
    */
   signed_zero_count++;
}
bool
nir_algebraic_pattern_test::skip_test(nir_alu_instr *alu, uint32_t bit_size,
nir_const_value tmp, int32_t src_index)
@ -206,18 +237,6 @@ nir_algebraic_pattern_test::skip_test(nir_alu_instr *alu, uint32_t bit_size,
/* Always pass the test for signed zero/nan/inf sources if they are not preserved. */
if (bit_size >= 16) {
double val = nir_const_value_as_float(tmp, bit_size);
if ((!exact || !(fp_math_ctrl & nir_fp_preserve_signed_zero)) && val == 0.0 && signbit(val)) {
/* TODO: Could be more permissive in covering input values -- right now
* we skip if either before or after ever consume or produce a -0.0,
* but if the result was unchanged by the 0.0 signs of the srcs, or if
* the two sides agreed about the sign of 0.0s produced, we could test
* that the rest of the expression evaluated correctly.
*
* Also, the fp preserve flags should probably not apply to non-float
* uses/outputs!
*/
return true;
}
if ((!exact || !(fp_math_ctrl & nir_fp_preserve_nan)) && isnan(val))
return true;
if ((!exact || !(fp_math_ctrl & nir_fp_preserve_inf)) && isinf(val))
@ -344,8 +363,10 @@ nir_algebraic_pattern_test::evaluate_expression(nir_instr *instr)
for (uint32_t j = 0; j < nir_ssa_alu_instr_src_components(alu, i); j++) {
nir_const_value tmp = tmp_value(alu->src[i].src.ssa)[alu->src[i].swizzle[j]];
src[i][j] = tmp;
if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float)
handle_signed_zero(&tmp, alu->src[i].src.ssa->bit_size);
src[i][j] = tmp;
if (skip_test(alu, alu->src[i].src.ssa->bit_size, tmp, i))
return true;
}
@ -372,6 +393,8 @@ nir_algebraic_pattern_test::evaluate_expression(nir_instr *instr)
return true;
for (uint32_t comp = 0; comp < alu->def.num_components; comp++) {
if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float)
handle_signed_zero(&dest[comp], alu->def.bit_size);
if (skip_test(alu, bit_size, dest[comp], -1))
return true;
}
@ -491,26 +514,59 @@ nir_algebraic_pattern_test::validate_pattern()
if (!check_variable_conds())
continue;
nir_foreach_instr(instr, block) {
bool is_assert = (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_unit_test_assert_eq);
if (evaluate_expression(instr)) {
if (is_assert) {
if (result == UNSUPPORTED)
result = PASS;
/* Loop over the set of 0.0 sign flips we want to try to see if the
* pattern works that way, given the NIR spec for
* !nir_fp_preserve_signed_zero of "any -0.0 or +0.0 output can have
* either sign, and any zero input can be treated as having opposite sign."
*/
uint32_t saved_signed_zero_count = 0;
enum result seed_result = UNSUPPORTED;
for (signed_zero_iter = 0; signed_zero_iter < 1u << MIN2(4, saved_signed_zero_count); signed_zero_iter++) {
/* This will get incremented as we evaluate the instrs. */
signed_zero_count = 0;
/* Loop over the instructions evaluating them given the inputs and this set of signed zero flips. */
nir_foreach_instr(instr, block) {
bool is_assert = (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_unit_test_assert_eq);
if (evaluate_expression(instr)) {
if (is_assert)
seed_result = PASS;
break;
} else {
if (is_assert) {
seed_result = FAIL;
}
}
break;
} else if (is_assert) {
result = FAIL;
}
if (signed_zero_iter == 0)
saved_signed_zero_count = signed_zero_count;
if (seed_result == PASS)
break;
}
/* If we found a fail, break out so we can print the shader with the
* failing values. Otherwise, continue iterating over input values to
* find any broken ones.
*/
if (result == FAIL)
break;
if (seed_result == PASS) {
result = PASS;
/* Don't break out of the loop that feeds us new inputs -- we want to continue to test the rest to find a failure. */
} else if (seed_result == UNSUPPORTED) {
/* The test skipped for these inputs, don't change the final result. */
} else {
bool sz_non_exhaustive = saved_signed_zero_count > 31 || signed_zero_iter < (1u << saved_signed_zero_count);
if (sz_non_exhaustive) {
/* We don't seem to trigger this case in practice. */
printf("Skipping test input due to too many signed zeroes to exhaustively test.\n");
} else {
/* We got a fail result with every combination of
* nir_fp_preserve_signed_zero bit flips applied. Break out so we
* can print the shader with the failing values.
*/
result = FAIL;
break;
}
}
}
ASSERT_EQ(result, expected_result);

View file

@ -75,12 +75,23 @@ class nir_algebraic_pattern_test : public nir_test {
bool evaluate_expression(nir_instr *instr);
bool skip_test(nir_alu_instr *alu, uint32_t bit_size,
nir_const_value tmp, int32_t src_index);
void handle_signed_zero(nir_const_value *val, uint32_t bit_size);
public:
nir_const_value *tmp_value(nir_def *def);
std::vector<nir_algebraic_pattern_test_input> inputs;
uint32_t fuzzing_bits;
/* Iteration count for signed 0 non-preservation search -- we set up
* signed_zero_count during the first iteration, and during other iterations
* we'll flip signs to try to see if the pattern ever matches.
*/
uint32_t signed_zero_iter;
/* Number of 0.0s encountered in the current signed_zero_iter. */
uint32_t signed_zero_count;
bool exact = true;
enum result expected_result = PASS;
const char *expression_cond_failed = NULL;