Patch: "aco: don't combine precise max(min()) to med3"

What this patch carries:
  * .pick_status.json: flips "resolution" from 0 to 1 on the entry whose
    description matches the title above.
  * src/amd/compiler/aco_optimizer.cpp: match_op3_for_vop3() gains a trailing
    `bool *precise` out-parameter, set to true when the first definition of
    either matched instruction is precise. It is written without a NULL check,
    so every caller must pass a valid pointer (the three callers touched here
    do). Only combine_clamp() acts on the value: it skips the med3 rewrite when
    `precise` is set and the outer opcode is not `min`.
  * src/amd/compiler/tests/test_optimizer.cpp: adds the optimize.clamp test.

Review notes:
  * Test case 2 originally called max.def(0).setPrecise(true) twice; the second
    call was a redundant duplicate and is dropped. The expected output for
    %res2 carries no "(precise)" marker, so the outer min is intentionally left
    non-precise. The last hunk's new-side length is 37 instead of 38 as a result.
  * The "index" line of the test file is carried over unchanged, so its
    post-image hash presumably no longer matches the result - regenerate if
    that matters.
  * Leading whitespace on every hunk line was reconstructed from a
    whitespace-collapsed copy of this patch - TODO confirm against the tree
    before applying.

diff --git a/.pick_status.json b/.pick_status.json
index f45a9d42634..0a42a22781b 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -238,7 +238,7 @@
         "description": "aco: don't combine precise max(min()) to med3",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": null
     },
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index aa9fad589b3..067dc5c5a94 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1958,7 +1958,8 @@ bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2,
                         Instruction* op1_instr, bool swap, const char *shuffle_str,
                         Operand operands[3], bool neg[3], bool abs[3], uint8_t *opsel,
                         bool *op1_clamp, uint8_t *op1_omod,
-                        bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel)
+                        bool *inbetween_neg, bool *inbetween_abs, bool *inbetween_opsel,
+                        bool *precise)
 {
    /* checks */
    if (op1_instr->opcode != op1)
@@ -1999,6 +2000,9 @@ bool match_op3_for_vop3(opt_ctx &ctx, aco_opcode op1, aco_opcode op2,
    else if (op1_vop3 && op1_vop3->opsel & (1 << swap))
       return false;
 
+   *precise = op1_instr->definitions[0].isPrecise() ||
+              op2_instr->definitions[0].isPrecise();
+
    int shuffle[3];
    shuffle[shuffle_str[0] - '0'] = 0;
    shuffle[shuffle_str[1] - '0'] = 1;
@@ -2051,12 +2055,12 @@ bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode
          continue;
 
       Operand operands[3];
-      bool neg[3], abs[3], clamp;
+      bool neg[3], abs[3], clamp, precise;
       uint8_t opsel = 0, omod = 0;
       if (match_op3_for_vop3(ctx, instr->opcode, op2,
                              instr.get(), swap, shuffle,
                              operands, neg, abs, &opsel,
-                             &clamp, &omod, NULL, NULL, NULL)) {
+                             &clamp, &omod, NULL, NULL, NULL, &precise)) {
          ctx.uses[instr->operands[swap].tempId()]--;
          create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
          return true;
@@ -2074,13 +2078,13 @@ bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposi
     * max(-min(a, b), c) -> max3(-a, -b, c) */
    for (unsigned swap = 0; swap < 2; swap++) {
       Operand operands[3];
-      bool neg[3], abs[3], clamp;
+      bool neg[3], abs[3], clamp, precise;
       uint8_t opsel = 0, omod = 0;
       bool inbetween_neg;
       if (match_op3_for_vop3(ctx, instr->opcode, opposite,
                              instr.get(), swap, "012",
                              operands, neg, abs, &opsel,
-                             &clamp, &omod, &inbetween_neg, NULL, NULL) &&
+                             &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
           inbetween_neg) {
          ctx.uses[instr->operands[swap].tempId()]--;
          neg[1] = true;
@@ -2320,11 +2324,17 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr,
 
    for (unsigned swap = 0; swap < 2; swap++) {
       Operand operands[3];
-      bool neg[3], abs[3], clamp;
+      bool neg[3], abs[3], clamp, precise;
       uint8_t opsel = 0, omod = 0;
       if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap,
                              "012", operands, neg, abs, &opsel,
-                             &clamp, &omod, NULL, NULL, NULL)) {
+                             &clamp, &omod, NULL, NULL, NULL, &precise)) {
+         /* max(min(src, upper), lower) returns upper if src is NaN, but
+          * med3(src, lower, upper) returns lower.
+          */
+         if (precise && instr->opcode != min)
+            continue;
+
          int const0_idx = -1, const1_idx = -1;
          uint32_t const0 = 0, const1 = 0;
          for (int i = 0; i < 3; i++) {
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 4bb5898e236..d484b98bbb4 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -122,3 +122,37 @@ BEGIN_TEST(optimize.cndmask)
       finish_opt_test();
    }
 END_TEST
+
+BEGIN_TEST(optimize.clamp)
+   //>> v1: %a, v1: %b, v1: %c, s2: %_:exec = p_startpgm
+   if (!setup_cs("v1 v1 v1", GFX9))
+      return;
+
+   //! v1: %res0 = v_med3_f32 4.0, 0, %a
+   //! p_unit_test 0, %res0
+   writeout(0, bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u),
+                        bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), inputs[0])));
+
+   //! v1: %res1 = v_med3_f32 0, 4.0, %a
+   //! p_unit_test 1, %res1
+   writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u),
+                        bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), inputs[0])));
+
+   /* correct NaN behaviour with precise */
+
+   //! v1: %res2 = v_med3_f32 4.0, 0, %a
+   //! p_unit_test 2, %res2
+   Builder::Result max = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), inputs[0]);
+   max.def(0).setPrecise(true);
+   Builder::Result min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), max);
+   writeout(2, min);
+
+   //! v1: (precise)%res3_tmp = v_min_f32 4.0, %a
+   //! v1: %res3 = v_max_f32 0, %res3_tmp
+   //! p_unit_test 3, %res3
+   min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), inputs[0]);
+   min.def(0).setPrecise(true);
+   writeout(3, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), min));
+
+   finish_opt_test();
+END_TEST