nir/lower_double_ops: handle signed zero with min/max

Ensure the following identities hold to match IEEE-754-2019 and upcoming NIR:

   min(-0, +0) = -0
   min(+0, -0) = -0
   max(-0, +0) = +0
   max(+0, -0) = +0

NVK uses this lowering. In a simple compute shader that runs fmin64 on an SSBO
with signed-zero preservation required, this patch increases the instruction
count from 47 to 52. Obviously I'm not thrilled by that, but I also couldn't
find any obvious way to mitigate it. (Maybe NVIDIA has special hardware
support here. By instruction count, lowering all the way to int64 is a loss,
though I don't know how to count cycles on NVIDIA.)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30075>
Alyssa Rosenzweig 2024-07-08 13:29:22 -04:00 committed by Marge Bot
parent 26de3d5366
commit 4ab3d95c11

@@ -499,6 +499,23 @@ lower_minmax(nir_builder *b, nir_op cmp, nir_def *src0, nir_def *src1)
   nir_def *cmp_res = nir_build_alu2(b, cmp, src0, src1);
   b->exact = false;
   nir_def *take_src0 = nir_ior(b, src1_is_nan, cmp_res);

   /* IEEE-754-2019 requires that fmin/fmax compare -0 < 0, but -0 and 0 are
    * indistinguishable for flt/fge. So, we fix up signed zeroes.
    */
   if (nir_is_float_control_signed_zero_preserve(b->fp_fast_math, 64)) {
      nir_def *src0_is_negzero = nir_ieq_imm(b, src0, 1ull << 63);
      nir_def *src1_is_poszero = nir_ieq_imm(b, src1, 0x0);
      nir_def *neg_pos_zero = nir_iand(b, src0_is_negzero, src1_is_poszero);

      if (cmp == nir_op_flt) {
         take_src0 = nir_ior(b, take_src0, neg_pos_zero);
      } else {
         assert(cmp == nir_op_fge);
         take_src0 = nir_iand(b, take_src0, nir_inot(b, neg_pos_zero));
      }
   }

   return nir_bcsel(b, take_src0, src0, src1);
}