nir/algebraic: Elide range clamping of f2u sources

There are no shader-db changes on ELK platforms because those platforms don't support 8- or 16-bit integer types. v2: Restrict patterns generated such that the integer limits are exactly representable in the specified floating point format. With the exception of the value 0, this requires that float_sz > int_sz. This had no impact on shader-db or fossil-db on any Intel platform. Noticed by Georg. v3: Add a missing is_a_number. shader-db: All Intel platforms had similar results. (Lunar Lake shown) total cycles in shared programs: 889936056 -> 889934082 (<.01%) cycles in affected programs: 65806 -> 63832 (-3.00%) helped: 2 / HURT: 0 fossil-db: Lunar Lake Totals: Instrs: 233284796 -> 233282917 (-0.00%); split: -0.00%, +0.00% Cycle count: 32756399804 -> 32754972188 (-0.00%); split: -0.01%, +0.00% Spill count: 519861 -> 519813 (-0.01%) Fill count: 663650 -> 663626 (-0.00%); split: -0.01%, +0.01% Max live registers: 71738626 -> 71738696 (+0.00%) Non SSA regs after NIR: 67837902 -> 67837648 (-0.00%) Totals from 1236 (0.16% of 790723) affected shaders: Instrs: 2134504 -> 2132625 (-0.09%); split: -0.09%, +0.01% Cycle count: 604922278 -> 603494662 (-0.24%); split: -0.48%, +0.25% Spill count: 16509 -> 16461 (-0.29%) Fill count: 32760 -> 32736 (-0.07%); split: -0.22%, +0.15% Max live registers: 250112 -> 250182 (+0.03%) Non SSA regs after NIR: 302368 -> 302114 (-0.08%) Meteor Lake, DG2, and Tiger Lake had similar results. 
(Meteor Lake shown) Totals: Instrs: 264095370 -> 264094056 (-0.00%); split: -0.00%, +0.00% Cycle count: 26554146277 -> 26553027268 (-0.00%); split: -0.01%, +0.01% Spill count: 530603 -> 530615 (+0.00%) Fill count: 613231 -> 613273 (+0.01%) Max live registers: 46559041 -> 46559087 (+0.00%) Totals from 1237 (0.14% of 905547) affected shaders: Instrs: 2262517 -> 2261203 (-0.06%); split: -0.07%, +0.01% Cycle count: 518219799 -> 517100790 (-0.22%); split: -0.59%, +0.37% Spill count: 17518 -> 17530 (+0.07%) Fill count: 32273 -> 32315 (+0.13%) Max live registers: 128360 -> 128406 (+0.04%) Ice Lake and Skylake had similar results. (Ice Lake shown) Totals: Instrs: 269849640 -> 269848198 (-0.00%); split: -0.00%, +0.00% Cycle count: 26718329643 -> 26718289020 (-0.00%); split: -0.00%, +0.00% Max live registers: 46878430 -> 46878462 (+0.00%) Totals from 1233 (0.14% of 905427) affected shaders: Instrs: 2324225 -> 2322783 (-0.06%); split: -0.06%, +0.00% Cycle count: 531467501 -> 531426878 (-0.01%); split: -0.11%, +0.10% Max live registers: 130782 -> 130814 (+0.02%) Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37186>
2026-03-17 10:20:33 +01:00 · 2025-06-16 16:28:52 -07:00 · 2025-06-16 16:28:52 -07:00 · c49d6e0480
commit c49d6e0480
parent 073ffceef6
1 changed files with 39 additions and 0 deletions
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -3635,6 +3635,45 @@ late_optimizations = [
   (('udiv_aligned_4', a), ('ushr', a, 2)),
 ]

+for int_sz in (8, 16, 32):
+    # Note: Python's float is 64-bit, so it can exactly represent these
+    # limits for up to 32 bits. intmin is the (negative) signed minimum.
+    uintmax = float((1 << int_sz) - 1)
+    intmax = float((1 << (int_sz - 1)) - 1)
+    intmin = -float(1 << (int_sz - 1))
+
+    # Don't generate patterns that try to emit saturating conversion from
+    # 64-bit float to 8-bit integer. These are generally not supported by any
+    # drivers.
+    all_float_sizes = (16, 32, 64) if int_sz > 8 else (16, 32)
+
+    for float_sz in all_float_sizes:
+        # The floating point type can only precisely represent the integer
+        # minimum or maximum if it has enough mantissa and exponent bits.
+        # Zero is always representable, so the patterns below are unguarded.
+        if float_sz > int_sz:
+            late_optimizations.extend([
+                # This requires is_a_number because f2i_sat(NaN) is zero, but
+                # fmax(intmin, NaN) is intmin.
+                ((f'f2i{int_sz}', ('fmax', f'a@{float_sz}(is_a_number)', intmin)), (f'f2i{int_sz}_sat', a), 'options->has_f2i_sat'),
+                # The upper-bound clamps carry the same is_a_number guard.
+                ((f'f2i{int_sz}', ('fmin', f'a@{float_sz}(is_a_number)', intmax)), (f'f2i{int_sz}_sat', a), 'options->has_f2i_sat'),
+                ((f'f2u{int_sz}', ('fmin', f'a@{float_sz}(is_a_number)', uintmax)), (f'f2u{int_sz}_sat', a), 'options->has_f2u_sat'),
+            ])
+
+        late_optimizations.extend([
+            # This does not require is_a_number because both f2u_sat(NaN) and
+            # fmax(NaN, 0) are zero.
+            ((f'f2u{int_sz}', ('fmax', f'a@{float_sz}', 0.0)), (f'f2u{int_sz}_sat', a), 'options->has_f2u_sat'),
+
+            # f2i_sat(NaN) and f2u_sat(NaN) are zero, like the NaN -> 0 select.
+            ((f'f2i{int_sz}', ('bcsel', ('feq', f'a@{float_sz}', a), a, 0.0)), (f'f2i{int_sz}_sat', a), 'options->has_f2i_sat'),
+            ((f'f2u{int_sz}', ('bcsel', ('feq', f'a@{float_sz}', a), a, 0.0)), (f'f2u{int_sz}_sat', a), 'options->has_f2u_sat'),
+
+            ((f'f2i{int_sz}', ('bcsel', ('fneu', f'a@{float_sz}', a), 0.0, a)), (f'f2i{int_sz}_sat', a), 'options->has_f2i_sat'),
+            ((f'f2u{int_sz}', ('bcsel', ('fneu', f'a@{float_sz}', a), 0.0, a)), (f'f2u{int_sz}_sat', a), 'options->has_f2u_sat'),
+        ])
+
 # re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
 # gets combined to fma(a, b, -c).
 for sz, mulz in itertools.product([16, 32, 64], [False, True]):