diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index ecd4efc4b63..d964d7a3457 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -268,10 +268,10 @@ optimizations = [ (('fadd', a, a), ('fmul', a, 2.0)), (('fadd(contract)', a, ('fadd(is_used_once)', a, b)), ('fadd', b, ('fmul', a, 2.0))), (('~fmul', a, 0.0), 0.0), - (('~fmul', a, -0.0), 0.0, 'true', TestStatus.UNSUPPORTED), # No support for inexactly testing -0.0 inputs + (('~fmul', a, -0.0), 0.0), # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN (('fmul(nsz,nnan)', 'a', 0.0), 0.0), - (('fmul(nsz,nnan)', 'a', -0.0), 0.0, 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs + (('fmul(nsz,nnan)', 'a', -0.0), 0.0), (('fmulz', a, 0.0), 0.0), (('fmulz', a, -0.0), 0.0), (('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)), @@ -294,7 +294,7 @@ optimizations = [ (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)), (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)), (('ffma(nsz,nnan)', 0.0, a, b), ('fcanonicalize', b)), - (('ffma(nsz,nnan)', -0.0, a, b), ('fcanonicalize', b), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs + (('ffma(nsz,nnan)', -0.0, a, b), ('fcanonicalize', b)), (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)), (('ffmaz', -0.0, a, b), ('fadd', 0.0, b)), (('ffma(nsz)', a, b, 0.0), ('fmul', a, b)), @@ -308,11 +308,11 @@ optimizations = [ (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)), (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)), (('flrp(nnan,nsz)', a, b, 0.0), ('fcanonicalize', a)), - (('flrp(nnan,nsz)', a, b, -0.0), ('fcanonicalize', a), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs + (('flrp(nnan,nsz)', a, b, -0.0), ('fcanonicalize', a)), (('flrp(nnan,nsz)', a, b, 1.0), ('fcanonicalize', b)), (('~flrp', a, a, b), ('fcanonicalize', a)), (('flrp(nnan,nsz)', 0.0, a, b), 
('fmul', a, b)), - (('flrp(nnan,nsz)', -0.0, a, b), ('fmul', a, b), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs + (('flrp(nnan,nsz)', -0.0, a, b), ('fmul', a, b)), # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c) (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)), @@ -974,8 +974,8 @@ optimizations.extend([ (('fneg', ('fmin(is_used_once)', ('fneg', a), ('fneg', b))), ('fmax', a, b)), (('fneg', ('fmax(is_used_once)', ('fneg', a), '#b')), ('fmin', a, ('fneg', b))), (('fneg', ('fmin(is_used_once)', ('fneg', a), '#b')), ('fmax', a, ('fneg', b))), - (('fmin(nsz)', a, -0.0), ('fmin', a, 0.0), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs - (('fmax(nsz)', a, -0.0), ('fmax', a, 0.0), 'true', TestStatus.UNSUPPORTED), # No support for nsz testing -0.0 inputs + (('fmin(nsz)', a, -0.0), ('fmin', a, 0.0)), + (('fmax(nsz)', a, -0.0), ('fmax', a, 0.0)), ]) for op in ['ine', 'ieq', 'ilt', 'ige', 'ult', 'uge', 'bitz', 'bitnz', @@ -3463,7 +3463,7 @@ optimizations.extend([ ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)), ('iand', a, 1 << 31), ('!f2f32', ('!f2f16_rtne', a))), - 'options->lower_fquantize2f16', TestStatus.UNSUPPORTED), # All test inputs skipped. + 'options->lower_fquantize2f16'), ]) for s in range(0, 31): @@ -4182,8 +4182,6 @@ for t in ['f', 'i']: add_used_once = '~{}add(is_used_once)'.format(t) add = '~{}add'.format(t) mul = '~{}mul'.format(t) - # With so many inputs, we always get an inf or a nan in testing. 
- expected = TestStatus.UNSUPPORTED if t == 'f' else TestStatus.PASS # Variable names used below were selected based on these layouts: # mat4 mat4 vec4 @@ -4203,10 +4201,10 @@ for t in ['f', 'i']: step8 = (add, (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), (mul, 'ee', 'ii')), (mul, 'ff', 'jj')) # This finds and replaces common (mat4*mat4)*vec4 with something that will get optimised down to mat4*(mat4*vec4) - mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), (mul, step4, 'jj')), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')), 'true', expected)] + mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), (mul, step4, 'jj')), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')))] # This helps propagate the above improvement further up the mul chain e.g. mat4*mat4*mat4*vec4 to (mat4*vec4)*mat4*mat4 - mat_mul_optimizations += [((add_first, (add, (add, (mul, 'gg', step1), (mul,'hh', step2)), (mul, 'ii', step3)), (mul, 'jj', step4)), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')), 'true', expected)] + mat_mul_optimizations += [((add_first, (add, (add, (mul, 'gg', step1), (mul,'hh', step2)), (mul, 'ii', step3)), (mul, 'jj', step4)), (add, (add, (add, (mul, step5, 'a'), (mul, step6, 'b')), (mul, step7, 'c')), (mul, step8, 'd')))] # Below handles a real world shader that looks like this mat4*mat4*vec4(xyz, 1.0) where the the multiplication of the 1.0 constant has been optimised away step5_no_w_mul = (add, (add, (add, (mul, 'q', 'gg'), (mul, 'r', 'hh')), (mul, 's', 'ii')), 't') @@ -4214,7 +4212,7 @@ for t in ['f', 'i']: step7_no_w_mul = (add, (add, (add, (mul, 'y', 'gg'), (mul, 'z', 'hh')), (mul, 'aa', 'ii')), 'bb') step8_no_w_mul = (add, (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), (mul, 'ee', 'ii')), 'ff') - 
mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), step4), (add, (add, (add, (mul, step5_no_w_mul, 'a'), (mul, step6_no_w_mul, 'b')), (mul, step7_no_w_mul, 'c')), (mul, step8_no_w_mul, 'd')), 'true', expected)] + mat_mul_optimizations += [((add_first, (add, (add, (mul, step1, 'gg'), (mul, step2, 'hh')), (mul, step3, 'ii')), step4), (add, (add, (add, (mul, step5_no_w_mul, 'a'), (mul, step6_no_w_mul, 'b')), (mul, step7_no_w_mul, 'c')), (mul, step8_no_w_mul, 'd')))] # Below handles a real world shader that looks like this mat4*mat4*vec4(xy, 0.0, 1.0) where the the multiplication of the 0.0 and 1.0 constants have been optimised away step5_zero_z_no_w_mul = (add, (add, (mul, 'q', 'gg'), (mul, 'r', 'hh')), 't') @@ -4222,7 +4220,7 @@ for t in ['f', 'i']: step7_zero_z_no_w_mul = (add, (add, (mul, 'y', 'gg'), (mul, 'z', 'hh')), 'bb') step8_zero_z_no_w_mul = (add, (add, (mul, 'cc', 'gg'), (mul, 'dd', 'hh')), 'ff') - mat_mul_optimizations += [((add_first, (add, (mul, step1, 'gg'), step4), (mul, step2, 'hh')), (add, (add, (add, (mul, step5_zero_z_no_w_mul, 'a'), (mul, step6_zero_z_no_w_mul, 'b')), (mul, step7_zero_z_no_w_mul, 'c')), (mul, step8_zero_z_no_w_mul, 'd')), 'true', expected)] + mat_mul_optimizations += [((add_first, (add, (mul, step1, 'gg'), step4), (mul, step2, 'hh')), (add, (add, (add, (mul, step5_zero_z_no_w_mul, 'a'), (mul, step6_zero_z_no_w_mul, 'b')), (mul, step7_zero_z_no_w_mul, 'c')), (mul, step8_zero_z_no_w_mul, 'd')))] before_lower_int64_optimizations = [ # The i2i64(a) implies that 'a' has at most 32-bits of data. 
diff --git a/src/compiler/nir/tests/nir_algebraic_pattern_test.cpp b/src/compiler/nir/tests/nir_algebraic_pattern_test.cpp index 3665d628119..4a78fc6a87c 100644 --- a/src/compiler/nir/tests/nir_algebraic_pattern_test.cpp +++ b/src/compiler/nir/tests/nir_algebraic_pattern_test.cpp @@ -199,6 +199,37 @@ static const double float_inputs[INPUT_VALUE_COUNT] = { DBL_MIN, }; +void +nir_algebraic_pattern_test::handle_signed_zero(nir_const_value *val, uint32_t bit_size) +{ + if (bit_size < 16 || nir_const_value_as_float(*val, bit_size) != 0.0) + return; + + /* If we're preserving signed zeroes, no need to do any of this work. */ + if (exact && (fp_math_ctrl & nir_fp_preserve_signed_zero)) + return; + + if (signed_zero_iter != 0) { + if (signed_zero_count < 32 && + (1u << signed_zero_count) & signed_zero_iter) { + switch (bit_size) { + case 16: + val->u16 ^= 0x8000; + break; + case 32: + val->f32 = -val->f32; + break; + case 64: + val->f64 = -val->f64; + break; + default: + UNREACHABLE("bad bit size"); + } + } + } + signed_zero_count++; +} + bool nir_algebraic_pattern_test::skip_test(nir_alu_instr *alu, uint32_t bit_size, nir_const_value tmp, int32_t src_index) @@ -206,18 +237,6 @@ nir_algebraic_pattern_test::skip_test(nir_alu_instr *alu, uint32_t bit_size, /* Always pass the test for signed zero/nan/inf sources if they are not preserved. */ if (bit_size >= 16) { double val = nir_const_value_as_float(tmp, bit_size); - if ((!exact || !(fp_math_ctrl & nir_fp_preserve_signed_zero)) && val == 0.0 && signbit(val)) { - /* TODO: Could be more permissive in covering input values -- right now - * we skip if either before or after ever consume or produce a -0.0, - * but if the result was unchanged by the 0.0 signs of the srcs, or if - * the two sides agreed about the sign of 0.0s produced, we could test - * that the rest of the expression evaluated correctly. - * - * Also, the fp preserve flags should probably not apply to non-float - * uses/outputs! 
- */ - return true; - } if ((!exact || !(fp_math_ctrl & nir_fp_preserve_nan)) && isnan(val)) return true; if ((!exact || !(fp_math_ctrl & nir_fp_preserve_inf)) && isinf(val)) @@ -344,8 +363,10 @@ nir_algebraic_pattern_test::evaluate_expression(nir_instr *instr) for (uint32_t j = 0; j < nir_ssa_alu_instr_src_components(alu, i); j++) { nir_const_value tmp = tmp_value(alu->src[i].src.ssa)[alu->src[i].swizzle[j]]; - src[i][j] = tmp; + if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float) + handle_signed_zero(&tmp, alu->src[i].src.ssa->bit_size); + src[i][j] = tmp; if (skip_test(alu, alu->src[i].src.ssa->bit_size, tmp, i)) return true; } @@ -372,6 +393,8 @@ nir_algebraic_pattern_test::evaluate_expression(nir_instr *instr) return true; for (uint32_t comp = 0; comp < alu->def.num_components; comp++) { + if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float) + handle_signed_zero(&dest[comp], alu->def.bit_size); if (skip_test(alu, bit_size, dest[comp], -1)) return true; } @@ -491,26 +514,59 @@ nir_algebraic_pattern_test::validate_pattern() if (!check_variable_conds()) continue; - nir_foreach_instr(instr, block) { - bool is_assert = (instr->type == nir_instr_type_intrinsic && - nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_unit_test_assert_eq); - if (evaluate_expression(instr)) { - if (is_assert) { - if (result == UNSUPPORTED) - result = PASS; + /* Loop over the set of 0.0 sign flips we want to try to see if the + * pattern works that way, given the NIR spec for + * !nir_fp_preserve_signed_zero of "any -0.0 or +0.0 output can have + * either sign, and any zero input can be treated as having opposite sign." + */ + uint32_t saved_signed_zero_count = 0; + enum result seed_result = UNSUPPORTED; + for (signed_zero_iter = 0; signed_zero_iter < 1u << MIN2(4, saved_signed_zero_count); signed_zero_iter++) { + /* This will get incremented as we evaluate the instrs. 
*/ + signed_zero_count = 0; + + /* Loop over the instructions evaluating them given the inputs and this set of signed zero flips. */ + nir_foreach_instr(instr, block) { + bool is_assert = (instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_unit_test_assert_eq); + + if (evaluate_expression(instr)) { + if (is_assert) + seed_result = PASS; + break; + } else { + if (is_assert) { + seed_result = FAIL; + } } - break; - } else if (is_assert) { - result = FAIL; } + + if (signed_zero_iter == 0) + saved_signed_zero_count = signed_zero_count; + + if (seed_result == PASS) + break; } - /* If we found a fail, break out so we can print the shader with the - * failing values. Otherwise, continue iterating over input values to - * find any broken ones. - */ - if (result == FAIL) - break; + if (seed_result == PASS) { + result = PASS; + /* Don't break out of the loop that feeds us new inputs -- we want to continue to test the rest to find a failure. */ + } else if (seed_result == UNSUPPORTED) { + /* The test skipped for these inputs, don't change the final result. */ + } else { + bool sz_non_exhaustive = saved_signed_zero_count > 31 || signed_zero_iter < (1u << saved_signed_zero_count); + if (sz_non_exhaustive) { + /* We don't seem to trigger this case in practice. */ + printf("Skipping test input due to too many signed zeroes to exhaustively test.\n"); + } else { + /* We got a fail result with every combination of + * nir_fp_preserve_signed_zero bit flips applied. Break out so we + * can print the shader with the failing values. 
+ */ + result = FAIL; + break; + } + } } ASSERT_EQ(result, expected_result); diff --git a/src/compiler/nir/tests/nir_algebraic_pattern_test.h b/src/compiler/nir/tests/nir_algebraic_pattern_test.h index 2ffe830aee7..b9c03ce9048 100644 --- a/src/compiler/nir/tests/nir_algebraic_pattern_test.h +++ b/src/compiler/nir/tests/nir_algebraic_pattern_test.h @@ -75,12 +75,23 @@ class nir_algebraic_pattern_test : public nir_test { bool evaluate_expression(nir_instr *instr); bool skip_test(nir_alu_instr *alu, uint32_t bit_size, nir_const_value tmp, int32_t src_index); + void handle_signed_zero(nir_const_value *val, uint32_t bit_size); public: nir_const_value *tmp_value(nir_def *def); std::vector inputs; uint32_t fuzzing_bits; + + /* Iteration count for signed 0 non-preservation search -- we set up + * signed_zero_count during the first iteration, and during other iterations + * we'll flip signs to try to see if the pattern ever matches. + */ + uint32_t signed_zero_iter; + + /* Number of 0.0s encountered in the current signed_zero_iter. */ + uint32_t signed_zero_count; + bool exact = true; enum result expected_result = PASS; const char *expression_cond_failed = NULL;