nir/algebraic: optimize expressions using fmulz/ffmaz

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13436>
This commit is contained in:
Rhys Perry 2021-09-14 18:02:01 +01:00 committed by Marge Bot
parent 14b8227083
commit 495debebad
3 changed files with 75 additions and 14 deletions

View file

@ -146,10 +146,15 @@ optimizations = [
(('usadd_4x8_vc4', a, 0), a),
(('usadd_4x8_vc4', a, ~0), ~0),
(('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
(('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
(('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
(('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
(('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
(('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
(('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
(('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
(('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
(('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
(('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
(('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
(('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
@ -164,10 +169,18 @@ optimizations = [
# The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
(('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
(('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
(('fmulz', a, 0.0), 0.0),
(('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
(('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
(('fmulz', a, a), ('fmul', a, a)),
(('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_inf_nan_preserve_32),
(('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
(('ffmaz', a, a, b), ('ffma', a, a, b)),
(('imul', a, 0), 0),
(('umul_unorm_4x8_vc4', a, 0), 0),
(('umul_unorm_4x8_vc4', a, ~0), a),
(('~fmul', a, 1.0), a),
(('~fmulz', a, 1.0), a),
# The only effect a*1.0 can have is flushing denormals. If it's only used by
# a floating point instruction, they should flush any input denormals and
# this multiplication isn't needed.
@ -184,12 +197,17 @@ optimizations = [
(('~ffma', 0.0, a, b), b),
(('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
(('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
(('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
(('~ffma', a, b, 0.0), ('fmul', a, b)),
(('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
(('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
(('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_inf_nan_preserve_32),
(('ffma', 1.0, a, b), ('fadd', a, b)),
(('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_inf_nan_preserve_32),
(('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
(('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_inf_nan_preserve_32),
(('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
(('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
(('~flrp', a, b, 0.0), a),
(('~flrp', a, b, 1.0), b),
(('~flrp', a, a, b), a),
@ -769,6 +787,7 @@ optimizations.extend([
(('fsat', ('fsat', a)), ('fsat', a)),
(('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
(('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
(('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_inf_nan_preserve_32),
(('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
(('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
(('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
@ -1228,6 +1247,7 @@ optimizations.extend([
(('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
(('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
(('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
(('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
# Division and reciprocal
(('~fdiv', 1.0, a), ('frcp', a)),
(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
@ -1546,29 +1566,42 @@ optimizations.extend([
# Propagate negation up multiplication chains
(('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
(('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_inf_nan_preserve_32),
(('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
(('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),
(('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
# Propagate constants up multiplication chains
(('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
(('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),
(('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),
(('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
(('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
(('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),
(('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),
# Prefer moving out a multiplication for more MAD/FMA-friendly code
(('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
(('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
(('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
(('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),
(('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
# Reassociate constants in add/mul chains so they can be folded together.
# For now, we mostly only handle cases where the constants are separated by
# a single non-constant. We could do better eventually.
(('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
(('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),
(('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),
(('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
(('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),
(('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),
(('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
(('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
(('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
(('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))),
(('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
(('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))),
(('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
(('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
(('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
(('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)),
@ -1593,6 +1626,8 @@ optimizations.extend([
(('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
(('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
(('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
(('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
(('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
(('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
@ -2297,7 +2332,7 @@ for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
# mix(1, (a-1)+1, condition)
#
# Other optimizations will rearrange the constants.
for op in ['fadd', 'fmul', 'iadd', 'imul']:
for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:
optimizations += [
((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
]
@ -2339,7 +2374,7 @@ for op in ['flrp']:
(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
]
for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
optimizations += [
(('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
(('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
@ -2622,6 +2657,13 @@ late_optimizations = [
(('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
(('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
(('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
(('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
# Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
#
# If bits is zero, the result will be zero.
@ -2716,7 +2758,7 @@ for op in ['fadd']:
(('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
]
for op in ['ffma']:
for op in ['ffma', 'ffmaz']:
late_optimizations += [
(('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),

View file

@ -858,7 +858,8 @@ analyze_expression(const nir_alu_instr *instr, unsigned src,
break;
}
case nir_op_fmul: {
case nir_op_fmul:
case nir_op_fmulz: {
const struct ssa_result_range left =
analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
const struct ssa_result_range right =
@ -878,14 +879,19 @@ analyze_expression(const nir_alu_instr *instr, unsigned src,
} else
r.range = fmul_table[left.range][right.range];
/* Mulitpliation produces NaN for X * NaN and for 0 * ±Inf. If both
* operands are numbers and either both are finite or one is finite and
* the other cannot be zero, then the result must be a number.
*/
r.is_a_number = (left.is_a_number && right.is_a_number) &&
((left.is_finite && right.is_finite) ||
(!is_not_zero(left.range) && right.is_finite) ||
(left.is_finite && !is_not_zero(right.range)));
if (alu->op == nir_op_fmul) {
/* Mulitpliation produces NaN for X * NaN and for 0 * ±Inf. If both
* operands are numbers and either both are finite or one is finite and
* the other cannot be zero, then the result must be a number.
*/
r.is_a_number = (left.is_a_number && right.is_a_number) &&
((left.is_finite && right.is_finite) ||
(!is_not_zero(left.range) && right.is_finite) ||
(left.is_finite && !is_not_zero(right.range)));
} else {
/* nir_op_fmulz: unlike nir_op_fmul, 0 * ±Inf is a number. */
r.is_a_number = left.is_a_number && right.is_a_number;
}
break;
}
@ -1466,6 +1472,7 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht,
case nir_op_ubfe:
case nir_op_bfm:
case nir_op_fmul:
case nir_op_fmulz:
case nir_op_extract_u8:
case nir_op_extract_i8:
case nir_op_extract_u16:
@ -1584,6 +1591,7 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht,
}
break;
case nir_op_fmul:
case nir_op_fmulz:
/* infinity/NaN starts at 0x7f800000u, negative numbers at 0x80000000 */
if (src0 < 0x7f800000u && src1 < 0x7f800000u) {
float src0_f, src1_f;

View file

@ -288,7 +288,7 @@ is_not_fmul(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
if (src_alu->op == nir_op_fneg)
return is_not_fmul(ht, src_alu, 0, 0, NULL);
return src_alu->op != nir_op_fmul;
return src_alu->op != nir_op_fmul && src_alu->op != nir_op_fmulz;
}
static inline bool
@ -304,7 +304,7 @@ is_fmul(struct hash_table *ht, const nir_alu_instr *instr, unsigned src,
if (src_alu->op == nir_op_fneg)
return is_fmul(ht, src_alu, 0, 0, NULL);
return src_alu->op == nir_op_fmul;
return src_alu->op == nir_op_fmul || src_alu->op == nir_op_fmulz;
}
static inline bool
@ -499,6 +499,17 @@ is_finite(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
return v.is_finite;
}
static inline bool
is_finite_not_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
unsigned src, UNUSED unsigned num_components,
UNUSED const uint8_t *swizzle)
{
const struct ssa_result_range v = nir_analyze_range(ht, instr, src);
return v.is_finite &&
(v.range == lt_zero || v.range == gt_zero || v.range == ne_zero);
}
#define RELATION(r) \
static inline bool \