nir/lower_alu: optimize min/max signed zeros

We don't usually need a multi-instruction lowering.

With the agx change in the next commit, honeykrisp results:

Totals from 3589 (6.64% of 54019) affected shaders:
MaxWaves: 3598144 -> 3598400 (+0.01%); split: +0.02%, -0.01%
Instrs: 1445830 -> 1332394 (-7.85%)
CodeSize: 10696356 -> 9742130 (-8.92%)
Fills: 721 -> 723 (+0.28%); split: -0.14%, +0.42%
Scratch: 3980 -> 3968 (-0.30%)
ALU: 1156426 -> 1043198 (-9.79%)
FSCIB: 1156426 -> 1043196 (-9.79%)
IC: 267202 -> 267166 (-0.01%)
GPRs: 208765 -> 208712 (-0.03%); split: -0.16%, +0.14%
Uniforms: 683643 -> 683677 (+0.00%); split: -0.01%, +0.01%
Preamble instrs: 1163325 -> 1159314 (-0.34%)

Control results alone:

Totals:
Instrs: 110168 -> 107171 (-2.72%)

Totals from 71 (22.26% of 319) affected shaders:
Instrs: 48895 -> 45898 (-6.13%)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Marek Olšák <maraeo@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35989>
2026-05-05 07:28:11 +02:00 · 2025-07-07 15:10:31 -04:00 · 2025-07-07 15:10:31 -04:00 · fc95397957
commit fc95397957
parent 042adf3cc5
1 changed files with 33 additions and 2 deletions
--- a/src/compiler/nir/nir_lower_alu.c
+++ b/src/compiler/nir/nir_lower_alu.c
@@ -202,7 +202,6 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *instr, UNUSED void *cb_data)
      nir_def *s1 = nir_ssa_for_alu_src(b, instr, 1);

      bool max = instr->op == nir_op_fmax;
-      nir_def *iminmax = max ? nir_imax(b, s0, s1) : nir_imin(b, s0, s1);

      /* Lower the fmin/fmax to a no_signed_zero fmin/fmax. This ensures that
       * nir_lower_alu is idempotent, and allows the backend to implement
@@ -212,7 +211,39 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *instr, UNUSED void *cb_data)
      nir_def *fminmax = max ? nir_fmax(b, s0, s1) : nir_fmin(b, s0, s1);
      b->fp_fast_math = instr->fp_fast_math;

-      lowered = nir_bcsel(b, nir_feq(b, s0, s1), iminmax, fminmax);
+      /* If we have a constant source, we can usually optimize */
+      if (s0->num_components == 1 && s0->bit_size == 32) {
+         for (unsigned i = 0; i < 2 && lowered == NULL; ++i) {
+            if (!nir_src_is_const(instr->src[i].src))
+               continue;
+
+            uint32_t x = nir_alu_src_as_uint(instr->src[i]);
+            bool pos_zero = x == fui(+0.0);
+            bool neg_zero = x == fui(-0.0);
+            nir_def *zero = i == 0 ? s0 : s1;
+            nir_def *other = i == 0 ? s1 : s0;
+
+            if (!pos_zero && !neg_zero) {
+               /* The lowering is only required when both sources are zero, so
+                * if we have a nonzero constant source, skip the lowering.
+                */
+               lowered = fminmax;
+            } else if (pos_zero && max) {
+               /* max(x, +0.0) = +0.0 < x ? x : +0.0 */
+               lowered = nir_bcsel(b, nir_flt(b, zero, other), other, zero);
+            } else if (neg_zero && !max) {
+               /* min(x, -0.0) = x < -0.0 ? x : -0.0 */
+               lowered = nir_bcsel(b, nir_flt(b, other, zero), other, zero);
+            }
+         }
+      }
+
+      /* Fallback on the emulation */
+      if (!lowered) {
+         nir_def *iminmax = max ? nir_imax(b, s0, s1) : nir_imin(b, s0, s1);
+         lowered = nir_bcsel(b, nir_feq(b, s0, s1), iminmax, fminmax);
+      }
+
      break;
   }