From 91ffeed88a4eccfb92bd1cc4a5a3169129128432 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 7 Oct 2020 11:40:45 +0100
Subject: [PATCH] aco: fix combine_constant_comparison_ordering() NaN check with 16/64-bit

No fossil-db changes.

Signed-off-by: Rhys Perry
Reviewed-by: Samuel Pitoiset
Cc: mesa-stable
Part-of:
---
Reviewer notes (placed after the three-dash line, so not part of the commit):
 - is_operand_constant() writes the operand's 64-bit constant value to *value
   and returns true when the operand is an inline constant, or when it is a
   temp whose ctx.info[] entry (looked up through original_temp_id()) reports
   is_constant_or_literal(bit_size). It returns false in every other case and
   leaves *value untouched.
 - is_constant_nan() reports a NaN bit pattern: all exponent bits set and a
   non-zero mantissa. The exponent/mantissa widths used are 5/10 for
   bit_size 16, 8/23 for bit_size 32 and 11/52 for any other bit_size.
 - The removed code only accepted 32-bit literals and always reinterpreted
   the constant as a 32-bit float before calling isnan().
 - Tests 0-5 expect the or/and pair to fold into a single comparison;
   tests 6 and 7 expect the s_or_b64 to be kept when the constant is a NaN.

 src/amd/compiler/aco_optimizer.cpp        | 46 ++++++++++------
 src/amd/compiler/tests/test_optimizer.cpp | 66 +++++++++++++++++++++++
 2 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 4add86b4340..3160531aaed 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1804,6 +1804,31 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
    return true;
 }
 
+bool is_operand_constant(opt_ctx &ctx, Operand op, unsigned bit_size, uint64_t *value)
+{
+   if (op.isConstant()) {
+      *value = op.constantValue64();
+      return true;
+   } else if (op.isTemp()) {
+      unsigned id = original_temp_id(ctx, op.getTemp());
+      if (!ctx.info[id].is_constant_or_literal(bit_size))
+         return false;
+      *value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
+      return true;
+   }
+   return false;
+}
+
+bool is_constant_nan(uint64_t value, unsigned bit_size)
+{
+   if (bit_size == 16)
+      return ((value >> 10) & 0x1f) == 0x1f && (value & 0x3ff);
+   else if (bit_size == 32)
+      return ((value >> 23) & 0xff) == 0xff && (value & 0x7fffff);
+   else
+      return ((value >> 52) & 0x7ff) == 0x7ff && (value & 0xfffffffffffff);
+}
+
 /* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
  * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
 bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& instr)
@@ -1829,7 +1854,8 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& in
    else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
       return false;
 
-   if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
+   unsigned bit_size = get_cmp_bitsize(cmp->opcode);
+   if (!is_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)
       return false;
 
    if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
@@ -1858,22 +1884,10 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr<Instruction>& in
    if (constant_operand == -1)
       return false;
 
-   uint32_t constant;
-   if (cmp->operands[constant_operand].isConstant()) {
-      constant = cmp->operands[constant_operand].constantValue();
-   } else if (cmp->operands[constant_operand].isTemp()) {
-      Temp tmp = cmp->operands[constant_operand].getTemp();
-      unsigned id = original_temp_id(ctx, tmp);
-      if (!ctx.info[id].is_constant_or_literal(32))
-         return false;
-      constant = ctx.info[id].val;
-   } else {
+   uint64_t constant_value;
+   if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))
       return false;
-   }
-
-   float constantf;
-   memcpy(&constantf, &constant, 4);
-   if (isnan(constantf))
+   if (is_constant_nan(constant_value, bit_size))
       return false;
 
    if (cmp->operands[0].isTemp())
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 2275d60cbda..94886e0bf93 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -299,3 +299,69 @@ BEGIN_TEST(optimize.clamp)
 
    finish_opt_test();
 END_TEST
+
+BEGIN_TEST(optimize.const_comparison_ordering)
+   //>> v1: %a, v1: %b, v2: %c, v1: %d, s2: %_:exec = p_startpgm
+   if (!setup_cs("v1 v1 v2 v1", GFX9))
+      return;
+
+   /* optimize to unordered comparison */
+   //! s2: %res0 = v_cmp_nge_f32 4.0, %a
+   //! p_unit_test 0, %res0
+   writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
+
+   //! s2: %res1 = v_cmp_nge_f32 4.0, %a
+   //! p_unit_test 1, %res1
+   writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
+
+   //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
+   //! p_unit_test 2, %res2
+   writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), bld.copy(bld.def(v1), Operand(0x40a00000u)), inputs[0])));
+
+   /* optimize to ordered comparison */
+   //! s2: %res3 = v_cmp_lt_f32 4.0, %a
+   //! p_unit_test 3, %res3
+   writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
+
+   //! s2: %res4 = v_cmp_lt_f32 4.0, %a
+   //! p_unit_test 4, %res4
+   writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
+
+   //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
+   //! p_unit_test 5, %res5
+   writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), bld.copy(bld.def(v1), Operand(0x40a00000u)), inputs[0])));
+
+   /* NaN */
+   uint16_t nan16 = 0x7e00;
+   uint32_t nan32 = 0x7fc00000;
+
+   //! s2: %tmp6_0 = v_cmp_lt_f16 0x7e00, %a
+   //! s2: %tmp6_1 = v_cmp_neq_f16 %a, %a
+   //! s2: %res6, s1: %_:scc = s_or_b64 %tmp6_1, %tmp6_0
+   //! p_unit_test 6, %res6
+   writeout(6, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand(nan16), inputs[0])));
+
+   //! s2: %tmp7_0 = v_cmp_lt_f32 0x7fc00000, %a
+   //! s2: %tmp7_1 = v_cmp_neq_f32 %a, %a
+   //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
+   //! p_unit_test 7, %res7
+   writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
+                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
+                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(nan32), inputs[0])));
+
+   finish_opt_test();
+END_TEST