nir/opt_algebraic_tests: Test !nir_fp_preserve_signed_zero behavior.

Iterate over a set of sign-flips for 0.0s to see if we can find a set that
makes the search and replace sides match.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39369>
This commit is contained in:
Emma Anholt 2026-01-16 22:27:01 -08:00 committed by Marge Bot
parent d9eb9a3eef
commit 7fd0287a89
3 changed files with 108 additions and 43 deletions

View file

@ -268,10 +268,10 @@ optimizations = [
(('fadd', a, a), ('fmul', a, 2.0)),
(('fadd(contract)', a, ('fadd(is_used_once)', a, b)), ('fadd', b, ('fmul', a, 2.0))),
(('~fmul', a, 0.0), 0.0),
(('~fmul', a, -0.0), 0.0, 'true', TestStatus.UNSUPPORTED), # No support for inexactly testing -0.0 inputs
(('~fmul', a, -0.0), 0.0),
# The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
(('fmul(nsz,nnan)', 'a', 0.0), 0.0),
(('fmul(nsz,nnan)', 'a', -0.0), 0.0, 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('fmul(nsz,nnan)', 'a', -0.0), 0.0),
(('fmulz', a, 0.0), 0.0),
(('fmulz', a, -0.0), 0.0),
(('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)),
@ -294,7 +294,7 @@ optimizations = [
(('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
(('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
(('ffma(nsz,nnan)', 0.0, a, b), ('fcanonicalize', b)),
(('ffma(nsz,nnan)', -0.0, a, b), ('fcanonicalize', b), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('ffma(nsz,nnan)', -0.0, a, b), ('fcanonicalize', b)),
(('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
(('ffmaz', -0.0, a, b), ('fadd', 0.0, b)),
(('ffma(nsz)', a, b, 0.0), ('fmul', a, b)),
@ -308,11 +308,11 @@ optimizations = [
(('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
(('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
(('flrp(nnan,nsz)', a, b, 0.0), ('fcanonicalize', a)),
(('flrp(nnan,nsz)', a, b, -0.0), ('fcanonicalize', a), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('flrp(nnan,nsz)', a, b, -0.0), ('fcanonicalize', a)),
(('flrp(nnan,nsz)', a, b, 1.0), ('fcanonicalize', b)),
(('~flrp', a, a, b), ('fcanonicalize', a)),
(('flrp(nnan,nsz)', 0.0, a, b), ('fmul', a, b)),
(('flrp(nnan,nsz)', -0.0, a, b), ('fmul', a, b), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('flrp(nnan,nsz)', -0.0, a, b), ('fmul', a, b)),
# flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
(('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
@ -974,8 +974,8 @@ optimizations.extend([
(('fneg', ('fmin(is_used_once)', ('fneg', a), ('fneg', b))), ('fmax', a, b)),
(('fneg', ('fmax(is_used_once)', ('fneg', a), '#b')), ('fmin', a, ('fneg', b))),
(('fneg', ('fmin(is_used_once)', ('fneg', a), '#b')), ('fmax', a, ('fneg', b))),
(('fmin(nsz)', a, -0.0), ('fmin', a, 0.0), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('fmax(nsz)', a, -0.0), ('fmax', a, 0.0), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs
(('fmin(nsz)', a, -0.0), ('fmin', a, 0.0)),
(('fmax(nsz)', a, -0.0), ('fmax', a, 0.0)),
])
for op in ['ine', 'ieq', 'ilt', 'ige', 'ult', 'uge', 'bitz', 'bitnz',
@ -3463,7 +3463,7 @@ optimizations.extend([
('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)),
('iand', a, 1 << 31),
('!f2f32', ('!f2f16_rtne', a))),
'options->lower_fquantize2f16', TestStatus.UNSUPPORTED), # All test inputs skipped.
'options->lower_fquantize2f16'),
])
for s in range(0, 31):
@ -4182,8 +4182,6 @@ for t in ['f', 'i']:
add_used_once = '~{}add(is_used_once)'.format(t)
add = '~{}add'.format(t)
mul = '~{}mul'.format(t)
# With so many inputs, we always get an inf or a nan in testing.
expected = TestStatus.UNSUPPORTED if t == 'f' else TestStatus.PASS
# Variable names used below were selected based on these layouts:
# mat4 mat4 vec4
@ -4203,10 +4201,10 @@ for t in ['f', 'i']:
step8 = (add, (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), (mul, 'ee', 'ii')), (mul, 'ff', 'jj'))
# This finds and replaces common (mat4*mat4)*vec4 with something that will get optimised down to mat4*(mat4*vec4)
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), (mul, step4, 'jj')), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), (mul, step4, 'jj')), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')))]
# This helps propagate the above improvement further up the mul chain e.g. mat4*mat4*mat4*vec4 to (mat4*vec4)*mat4*mat4
mat_mul_optimizations += [((add_first, (add, (add, (mul, 'gg', step1), (mul,'hh', step2)), (mul, 'ii', step3)), (mul, 'jj', step4)), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (add, (mul, 'gg', step1), (mul,'hh', step2)), (mul, 'ii', step3)), (mul, 'jj', step4)), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')))]
# Below handles a real world shader that looks like this mat4*mat4*vec4(xyz, 1.0) where the multiplication of the 1.0 constant has been optimised away
step5_no_w_mul = (add, (add, (add, (mul, 'q', 'gg'), (mul, 'r', 'hh')), (mul, 's', 'ii')), 't')
@ -4214,7 +4212,7 @@ for t in ['f', 'i']:
step7_no_w_mul = (add, (add, (add, (mul, 'y', 'gg'), (mul, 'z', 'hh')), (mul, 'aa', 'ii')), 'bb')
step8_no_w_mul = (add, (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), (mul, 'ee', 'ii')), 'ff')
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), step4), (add, (add, (add, (mul, step5_no_w_mul, 'a'), (mul, step6_no_w_mul, 'b')), (mul, step7_no_w_mul, 'c')), (mul, step8_no_w_mul, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), step4), (add, (add, (add, (mul, step5_no_w_mul, 'a'), (mul, step6_no_w_mul, 'b')), (mul, step7_no_w_mul, 'c')), (mul, step8_no_w_mul, 'd')))]
# Below handles a real world shader that looks like this mat4*mat4*vec4(xy, 0.0, 1.0) where the multiplication of the 0.0 and 1.0 constants has been optimised away
step5_zero_z_no_w_mul = (add, (add, (mul, 'q', 'gg'), (mul, 'r', 'hh')), 't')
@ -4222,7 +4220,7 @@ for t in ['f', 'i']:
step7_zero_z_no_w_mul = (add, (add, (mul, 'y', 'gg'), (mul, 'z', 'hh')), 'bb')
step8_zero_z_no_w_mul = (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), 'ff')
mat_mul_optimizations += [((add_first, (add, (mul, step1, 'gg'), step4), (mul, step2, 'hh')), (add, (add, (add, (mul, step5_zero_z_no_w_mul, 'a'), (mul, step6_zero_z_no_w_mul, 'b')), (mul, step7_zero_z_no_w_mul, 'c')), (mul, step8_zero_z_no_w_mul, 'd')), 'true', expected)]
mat_mul_optimizations += [((add_first, (add, (mul, step1, 'gg'), step4), (mul, step2, 'hh')), (add, (add, (add, (mul, step5_zero_z_no_w_mul, 'a'), (mul, step6_zero_z_no_w_mul, 'b')), (mul, step7_zero_z_no_w_mul, 'c')), (mul, step8_zero_z_no_w_mul, 'd')))]
before_lower_int64_optimizations = [
# The i2i64(a) implies that 'a' has at most 32-bits of data.

View file

@ -199,6 +199,37 @@ static const double float_inputs[INPUT_VALUE_COUNT] = {
DBL_MIN,
};
void
nir_algebraic_pattern_test::handle_signed_zero(nir_const_value *val, uint32_t bit_size)
{
   /* Possibly flip the sign of a floating-point zero, and count the zeroes
    * encountered, as part of the search over sign-flip combinations for
    * !nir_fp_preserve_signed_zero testing.
    *
    * Protocol with the caller (see validate_pattern): on iteration 0
    * (signed_zero_iter == 0) this only tallies zeroes into
    * signed_zero_count; on later iterations, the bits of signed_zero_iter
    * select which of the encountered zeroes get their sign flipped, so the
    * caller can look for a sign assignment under which the search and
    * replace sides match.
    */

   /* Only values that are exactly 0.0 at a float-capable bit size (16/32/64)
    * are candidates.
    */
   if (bit_size < 16 || nir_const_value_as_float(*val, bit_size) != 0.0)
      return;

   /* If we're preserving signed zeroes, no need to do any of this work. */
   if (exact && (fp_math_ctrl & nir_fp_preserve_signed_zero))
      return;

   if (signed_zero_iter != 0) {
      /* Flip this zero's sign if its bit is set in the iteration mask.  Only
       * the first 32 zeroes are addressable by the 32-bit mask; the caller is
       * responsible for detecting the non-exhaustive case.
       */
      if (signed_zero_count < 32 &&
          (1u << signed_zero_count) & signed_zero_iter) {
         switch (bit_size) {
         case 16:
            /* fp16 has no native C type here, so toggle the sign bit directly. */
            val->u16 ^= 0x8000;
            break;
         case 32:
            val->f32 = -val->f32;
            break;
         case 64:
            val->f64 = -val->f64;
            break;
         default:
            UNREACHABLE("bad bit size");
         }
      }
   }

   /* Tally every zero seen, whether or not it was flipped -- iteration 0
    * relies on this to establish how many flip combinations exist.
    */
   signed_zero_count++;
}
bool
nir_algebraic_pattern_test::skip_test(nir_alu_instr *alu, uint32_t bit_size,
nir_const_value tmp, int32_t src_index)
@ -206,18 +237,6 @@ nir_algebraic_pattern_test::skip_test(nir_alu_instr *alu, uint32_t bit_size,
/* Always pass the test for signed zero/nan/inf sources if they are not preserved. */
if (bit_size >= 16) {
double val = nir_const_value_as_float(tmp, bit_size);
if ((!exact || !(fp_math_ctrl & nir_fp_preserve_signed_zero)) && val == 0.0 && signbit(val)) {
/* TODO: Could be more permissive in covering input values -- right now
* we skip if either before or after ever consume or produce a -0.0,
* but if the result was unchanged by the 0.0 signs of the srcs, or if
* the two sides agreed about the sign of 0.0s produced, we could test
* that the rest of the expression evaluated correctly.
*
* Also, the fp preserve flags should probably not apply to non-float
* uses/outputs!
*/
return true;
}
if ((!exact || !(fp_math_ctrl & nir_fp_preserve_nan)) && isnan(val))
return true;
if ((!exact || !(fp_math_ctrl & nir_fp_preserve_inf)) && isinf(val))
@ -344,8 +363,10 @@ nir_algebraic_pattern_test::evaluate_expression(nir_instr *instr)
for (uint32_t j = 0; j < nir_ssa_alu_instr_src_components(alu, i); j++) {
nir_const_value tmp = tmp_value(alu->src[i].src.ssa)[alu->src[i].swizzle[j]];
src[i][j] = tmp;
if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float)
handle_signed_zero(&tmp, alu->src[i].src.ssa->bit_size);
src[i][j] = tmp;
if (skip_test(alu, alu->src[i].src.ssa->bit_size, tmp, i))
return true;
}
@ -372,6 +393,8 @@ nir_algebraic_pattern_test::evaluate_expression(nir_instr *instr)
return true;
for (uint32_t comp = 0; comp < alu->def.num_components; comp++) {
if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float)
handle_signed_zero(&dest[comp], alu->def.bit_size);
if (skip_test(alu, bit_size, dest[comp], -1))
return true;
}
@ -491,26 +514,59 @@ nir_algebraic_pattern_test::validate_pattern()
if (!check_variable_conds())
continue;
nir_foreach_instr(instr, block) {
bool is_assert = (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_unit_test_assert_eq);
if (evaluate_expression(instr)) {
if (is_assert) {
if (result == UNSUPPORTED)
result = PASS;
/* Loop over the set of 0.0 sign flips we want to try to see if the
* pattern works that way, given the NIR spec for
* !nir_fp_preserve_signed_zero of "any -0.0 or +0.0 output can have
* either sign, and any zero input can be treated as having opposite sign."
*/
uint32_t saved_signed_zero_count = 0;
enum result seed_result = UNSUPPORTED;
for (signed_zero_iter = 0; signed_zero_iter < 1u << MIN2(4, saved_signed_zero_count); signed_zero_iter++) {
/* This will get incremented as we evaluate the instrs. */
signed_zero_count = 0;
/* Loop over the instructions evaluating them given the inputs and this set of signed zero flips. */
nir_foreach_instr(instr, block) {
bool is_assert = (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_unit_test_assert_eq);
if (evaluate_expression(instr)) {
if (is_assert)
seed_result = PASS;
break;
} else {
if (is_assert) {
seed_result = FAIL;
}
}
break;
} else if (is_assert) {
result = FAIL;
}
if (signed_zero_iter == 0)
saved_signed_zero_count = signed_zero_count;
if (seed_result == PASS)
break;
}
/* If we found a fail, break out so we can print the shader with the
* failing values. Otherwise, continue iterating over input values to
* find any broken ones.
*/
if (result == FAIL)
break;
if (seed_result == PASS) {
result = PASS;
/* Don't break out of the loop that feeds us new inputs -- we want to continue to test the rest to find a failure. */
} else if (seed_result == UNSUPPORTED) {
/* The test skipped for these inputs, don't change the final result. */
} else {
bool sz_non_exhaustive = saved_signed_zero_count > 31 || signed_zero_iter < (1u << saved_signed_zero_count);
if (sz_non_exhaustive) {
/* We don't seem to trigger this case in practice. */
printf("Skipping test input due to too many signed zeroes to exhaustively test.\n");
} else {
/* We got a fail result with every combination of
* nir_fp_preserve_signed_zero bit flips applied. Break out so we
* can print the shader with the failing values.
*/
result = FAIL;
break;
}
}
}
ASSERT_EQ(result, expected_result);

View file

@ -75,12 +75,23 @@ class nir_algebraic_pattern_test : public nir_test {
bool evaluate_expression(nir_instr *instr);
bool skip_test(nir_alu_instr *alu, uint32_t bit_size,
nir_const_value tmp, int32_t src_index);
void handle_signed_zero(nir_const_value *val, uint32_t bit_size);
public:
nir_const_value *tmp_value(nir_def *def);
std::vector<nir_algebraic_pattern_test_input> inputs;
uint32_t fuzzing_bits;
/* Iteration count for signed 0 non-preservation search -- we set up
* signed_zero_count during the first iteration, and during other iterations
* we'll flip signs to try to see if the pattern ever matches.
*/
uint32_t signed_zero_iter;
/* Number of 0.0s encountered in the current signed_zero_iter. */
uint32_t signed_zero_count;
bool exact = true;
enum result expected_result = PASS;
const char *expression_cond_failed = NULL;