nir: fuse ffma even with float controls

The fmul+fadd -> fma rules in nir_opt_algebraic are marked imprecise
because they perform a contraction. However, they do respect signed
zero/Inf/NaN rules. As such, it is legal to do this fusion with shader float
controls as long as the exact bit is not set (the exact bit corresponds to the
SPIR-V NoContraction decoration).

Unfortunately, NIR's imprecise rules do not distinguish between contraction
issues and float special-case issues, forcing nir_search to skip all
imprecise rules when any shader float control mode is used. This notably
affects DXVK, which sets shader float controls to get D3D11 float behaviour and
hence loses FMA fusing.

Therefore, we plumb in the exact bit to express NoContraction independently of
the float controls, and weaken the requirement for fma fusion to contraction
merely being allowed. fma splitting has a similar issue, as an inexact GLSL fma
in SPIR-V is just a multiply-add that we're allowed to contract, rather than a
true fused multiply-add.

Drivers that use their own FMA fusing passes (notably, Intel and AMD) are
unaffected, but DXVK-capable drivers using fuse_ffma should benefit from this.
Results on hk are shown below:

Totals from 2194 (4.06% of 54019) affected shaders:
MaxWaves: 2174272 -> 2175936 (+0.08%); split: +0.08%, -0.01%
Instrs: 1173283 -> 1131494 (-3.56%); split: -3.57%, +0.01%
CodeSize: 8568168 -> 8381724 (-2.18%); split: -2.18%, +0.01%
Spills: 1094 -> 747 (-31.72%)
Fills: 988 -> 681 (-31.07%)
Scratch: 4444 -> 3820 (-14.04%)
ALU: 953032 -> 913149 (-4.18%); split: -4.19%, +0.01%
FSCIB: 953032 -> 913149 (-4.18%); split: -4.19%, +0.01%
IC: 215398 -> 215274 (-0.06%)
GPRs: 139865 -> 139032 (-0.60%); split: -1.56%, +0.96%
Uniforms: 414886 -> 414466 (-0.10%); split: -0.14%, +0.04%
Preamble instrs: 646398 -> 644017 (-0.37%); split: -0.43%, +0.07%

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35989>
This commit is contained in:
Alyssa Rosenzweig 2025-07-06 19:45:27 -04:00 committed by Marge Bot
parent 2c51a8870d
commit 2765017553
4 changed files with 11 additions and 6 deletions

View file

@ -214,6 +214,7 @@ class Value(object):
${'true' if val.nsz else 'false'},
${'true' if val.nnan else 'false'},
${'true' if val.ninf else 'false'},
${'true' if val.contract else 'false'},
${val.swizzle},
${val.c_opcode()},
${val.comm_expr_idx}, ${val.comm_exprs},
@ -393,6 +394,7 @@ class Expression(Value):
self.nsz = cond.pop('nsz', False)
self.nnan = cond.pop('nnan', False)
self.ninf = cond.pop('ninf', False)
self.contract = cond.pop('contract', False)
self.swizzle = -1 if m.group('swizzle') is None else swizzles[m.group('swizzle').removeprefix('.')]
assert len(cond) <= 1

View file

@ -485,10 +485,10 @@ optimizations.extend([
(('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
(('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
# Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
(('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
(('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
(('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
(('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),
(('ffma@16(contract)', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
(('ffma@32(contract)', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
(('ffma@64(contract)', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
(('ffmaz(contract)', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),
(('~fmul', ('fadd', ('bcsel', a, ('fmul', b, c), 0), '#d'), '#e'),
('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
@ -3563,7 +3563,7 @@ for sz, mulz in itertools.product([16, 32, 64], [False, True]):
fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)'
ffma = 'ffmaz' if mulz else 'ffma'
fadd = '~fadd@{}'.format(sz)
fadd = 'fadd@{}(contract)'.format(sz)
option = 'options->fuse_ffma{}'.format(sz)
late_optimizations.extend([

View file

@ -386,7 +386,7 @@ match_expression(const nir_algebraic_table *table, const nir_search_expression *
instr->def.bit_size != expr->value.bit_size)
return false;
state->inexact_match = expr->inexact || state->inexact_match;
state->inexact_match = expr->inexact || expr->contract || state->inexact_match;
state->has_exact_alu = (instr->exact && !expr->ignore_exact) || state->has_exact_alu;
if (state->inexact_match && state->has_exact_alu)
return false;

View file

@ -148,6 +148,9 @@ typedef struct {
/** Replacement does not preserve infinities. */
bool ninf : 1;
/** Replacement contracts an expression */
bool contract : 1;
/** Whether the use of the instruction should have a swizzle. */
int16_t swizzle : 5;