diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index aae86579bc7..b4ee15eefb7 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3446,6 +3446,9 @@ typedef struct nir_shader_compiler_options {
     */
    bool use_scoped_barrier;
 
+   /** Backend supports fmulz (and ffmaz if lower_ffma32=false) */
+   bool has_fmulz;
+
    /**
     * Is this the Intel vec4 backend?
     *
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index a104edc9882..c035c70ad9c 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -669,6 +669,20 @@ if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
    dst = src0 * src1;
 }
 """)
+
+# Unlike fmul, anything (even infinity or NaN) multiplied by zero is always zero.
+# fmulz(0.0, inf) and fmulz(0.0, nan) must be +/-0.0, even if
+# SIGNED_ZERO_INF_NAN_PRESERVE is not used. If SIGNED_ZERO_INF_NAN_PRESERVE is used, then
+# the result must be a positive zero if either operand is zero.
+binop("fmulz", tfloat32, _2src_commutative + associative, """
+if (src0 == 0.0 || src1 == 0.0)
+   dst = 0.0;
+else if (nir_is_rounding_mode_rtz(execution_mode, 32))
+   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
+else
+   dst = src0 * src1;
+""")
+
 # low 32-bits of signed/unsigned integer multiply
 binop("imul", tint, _2src_commutative + associative, """
    /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
@@ -960,6 +974,19 @@ if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 }
 """)
 
+# Unlike ffma, anything (even infinity or NaN) multiplied by zero is always zero.
+# ffmaz(0.0, inf, src2) and ffmaz(0.0, nan, src2) must be +/-0.0 + src2, even if
+# SIGNED_ZERO_INF_NAN_PRESERVE is not used. If SIGNED_ZERO_INF_NAN_PRESERVE is used, then
+# the result must be a positive zero plus src2 if either src0 or src1 is zero.
+triop("ffmaz", tfloat32, _2src_commutative, """
+if (src0 == 0.0 || src1 == 0.0)
+   dst = 0.0 + src2;
+else if (nir_is_rounding_mode_rtz(execution_mode, 32))
+   dst = _mesa_float_fma_rtz(src0, src1, src2);
+else
+   dst = fmaf(src0, src1, src2);
+""")
+
 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
 
 # Ternary addition
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index d5a9f7265bc..949e0f24278 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -330,10 +330,12 @@ optimizations.extend([
    (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
    (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
    (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
+   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
    # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
    (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
    (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
    (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
+   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),
 
    (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
     ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
@@ -2483,6 +2485,7 @@ late_optimizations = [
    (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
    (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
    (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
+   (('~fadd@32', ('fmulz', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'),
 
    # Subtractions get lowered during optimization, so we need to recombine them
    (('fadd', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
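
Note (not part of the patch): a minimal C sketch of the fmulz/ffmaz semantics the new opcodes describe, covering only the round-to-nearest path; the helper names fold_fmulz and fold_ffmaz are hypothetical and exist purely for illustration.

#include <math.h>

/* Hypothetical helpers mirroring the const_expr strings of the fmulz/ffmaz
 * opcodes added in nir_opcodes.py above; the _mesa_*_rtz rounding paths are
 * omitted for brevity. */
static float fold_fmulz(float src0, float src1)
{
   /* Unlike fmul, anything (even Inf or NaN) multiplied by zero is zero. */
   if (src0 == 0.0f || src1 == 0.0f)
      return 0.0f;
   return src0 * src1;
}

static float fold_ffmaz(float src0, float src1, float src2)
{
   /* ffmaz(0.0, x, src2) folds to 0.0 + src2 regardless of x. */
   if (src0 == 0.0f || src1 == 0.0f)
      return 0.0f + src2;
   return fmaf(src0, src1, src2);
}

The nir_opt_algebraic.py rules added above treat these opcodes like their ffma counterparts: ffmaz is lowered to fmulz+fadd under options->lower_ffma32, and the late pass re-fuses an fadd of an fmulz result into ffmaz under options->fuse_ffma32.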