nir: fuse ffma even with float controls

The fmul+fadd -> fma rules in nir_opt_algebraic are marked imprecise, because they are a contraction. However, they respect signed zero/Inf/NaN rules. As such, it is legal to do this fusion with shader float controls as long as the exact bit is not set (mapping to SPIR-V NoContraction).

Unfortunately, NIR's imprecise rules do not distinguish between contraction issues and float special case issues, forcing nir_search to skip all imprecise rules when any shader float control modes are used. This notably affects DXVK, which sets shader float controls to get D3D11 float behaviour and hence loses FMA fusing.

Therefore, we plumb in the exact bit to express NoContraction independent of the float controls, and weaken the requirement for fma fusion to allowable contraction.

For fma splitting, it's a similar issue, as inexact GLSL fma in SPIR-V is just a multiply add that we're allowed to contract rather than the real deal.

Drivers that use their own FMA fusing passes (notably, Intel and AMD) are unaffected, but DXVK-capable drivers using fuse_ffma should like this. Results on hk shown:

Totals from 2194 (4.06% of 54019) affected shaders:
MaxWaves: 2174272 -> 2175936 (+0.08%); split: +0.08%, -0.01%
Instrs: 1173283 -> 1131494 (-3.56%); split: -3.57%, +0.01%
CodeSize: 8568168 -> 8381724 (-2.18%); split: -2.18%, +0.01%
Spills: 1094 -> 747 (-31.72%)
Fills: 988 -> 681 (-31.07%)
Scratch: 4444 -> 3820 (-14.04%)
ALU: 953032 -> 913149 (-4.18%); split: -4.19%, +0.01%
FSCIB: 953032 -> 913149 (-4.18%); split: -4.19%, +0.01%
IC: 215398 -> 215274 (-0.06%)
GPRs: 139865 -> 139032 (-0.60%); split: -1.56%, +0.96%
Uniforms: 414886 -> 414466 (-0.10%); split: -0.14%, +0.04%
Preamble instrs: 646398 -> 644017 (-0.37%); split: -0.43%, +0.07%

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35989>
2026-05-05 18:18:06 +02:00 · 2025-07-06 19:45:27 -04:00 · 2025-07-06 19:45:27 -04:00 · 2765017553
commit 2765017553
parent 2c51a8870d
4 changed files with 11 additions and 6 deletions
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -214,6 +214,7 @@ class Value(object):
      ${'true' if val.nsz else 'false'},
      ${'true' if val.nnan else 'false'},
      ${'true' if val.ninf else 'false'},
+      ${'true' if val.contract else 'false'},
      ${val.swizzle},
      ${val.c_opcode()},
      ${val.comm_expr_idx}, ${val.comm_exprs},
@@ -393,6 +394,7 @@ class Expression(Value):
      self.nsz = cond.pop('nsz', False)
      self.nnan = cond.pop('nnan', False)
      self.ninf = cond.pop('ninf', False)
+      self.contract = cond.pop('contract', False)
      self.swizzle = -1 if m.group('swizzle') is None else swizzles[m.group('swizzle').removeprefix('.')]

      assert len(cond) <= 1
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -485,10 +485,10 @@ optimizations.extend([
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
-   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
-   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
-   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
-   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),
+   (('ffma@16(contract)', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
+   (('ffma@32(contract)', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
+   (('ffma@64(contract)', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
+   (('ffmaz(contract)', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   (('~fmul', ('fadd', ('bcsel', a, ('fmul', b, c), 0), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
@@ -3563,7 +3563,7 @@ for sz, mulz in itertools.product([16, 32, 64], [False, True]):
    fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)'
    ffma = 'ffmaz' if mulz else 'ffma'

-    fadd = '~fadd@{}'.format(sz)
+    fadd = 'fadd@{}(contract)'.format(sz)
    option = 'options->fuse_ffma{}'.format(sz)

    late_optimizations.extend([
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -386,7 +386,7 @@ match_expression(const nir_algebraic_table *table, const nir_search_expression *
       instr->def.bit_size != expr->value.bit_size)
      return false;

-   state->inexact_match = expr->inexact || state->inexact_match;
+   state->inexact_match = expr->inexact || expr->contract || state->inexact_match;
   state->has_exact_alu = (instr->exact && !expr->ignore_exact) || state->has_exact_alu;
   if (state->inexact_match && state->has_exact_alu)
      return false;
--- a/src/compiler/nir/nir_search.h
+++ b/src/compiler/nir/nir_search.h
@@ -148,6 +148,9 @@ typedef struct {
   /** Replacement does not preserve infinities. */
   bool ninf : 1;

+   /** Replacement contracts an expression */
+   bool contract : 1;
+
   /** Whether the use of the instruction should have a swizzle. */
   int16_t swizzle : 5;