diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 5d810b306bb..47066d4080f 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1316,6 +1316,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); } break; + case aco_opcode::v_mul_u32_u24: /* record the instruction that defines this multiply result via set_usedef; presumably looked up later by the v_mad_u32_u24 combine, TODO confirm */ + ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); + break; case aco_opcode::v_and_b32: { /* abs */ if (!instr->usesModifiers() && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr && @@ -2324,12 +2327,6 @@ bool combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) if (instr->usesModifiers()) return false; - /* Do not combine if the carry-out is used. */ - if ((instr->opcode == aco_opcode::v_add_co_u32 || - instr->opcode == aco_opcode::v_add_co_u32_e64) && - ctx.uses[instr->definitions[1].tempId()]) - return false; - for (unsigned i = 0; i < 2; i++) { Instruction *op_instr = follow_operand(ctx, instr->operands[i]); if (op_instr && @@ -2912,6 +2909,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr } else if (instr->opcode == aco_opcode::v_add_u32) { if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; else if (combine_add_bcnt(ctx, instr)) ; + else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ; /* try to fuse a v_mul_u32_u24 operand of this add into a single v_mad_u32_u24 */ else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) { if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; @@ -2924,8 +2922,10 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr } } else if (instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_add_co_u32_e64) { + bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0; /* true when definitions[1] (the carry-out) has at least one use */
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; - else combine_add_bcnt(ctx, instr); + else if (!carry_out && combine_add_bcnt(ctx, instr)) ; /* the bcnt combine is attempted only while carry_out is false */ + else if (!carry_out) combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2); /* same guard for the v_mul_u32_u24 to v_mad_u32_u24 combine */ } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || instr->opcode == aco_opcode::v_sub_co_u32_e64) { diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 8d6805febb8..bf7b51bee69 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -723,3 +723,24 @@ BEGIN_TEST(optimize.minmax) finish_opt_test(); } END_TEST + +BEGIN_TEST(optimize.mad_32_24) /* a v_mul_u32_u24 feeding an add is expected to become v_mad_u32_u24, except when def(1) of the add is consumed */ + for (unsigned i = GFX8; i <= GFX9; i++) { + //>> v1: %a, v1: %b, v1: %c, s2: %_:exec = p_startpgm + if (!setup_cs("v1 v1 v1", (chip_class)i)) + continue; + + //! v1: %res0 = v_mad_u32_u24 %b, %c, %a + //! p_unit_test 0, %res0 + Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]); /* case 0: only the sum is written out, so the expectation above is the fused v_mad_u32_u24 */ + writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul)); + + //! v1: %res1_tmp = v_mul_u32_u24 %b, %c + //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp + //! p_unit_test 1, %res1 + mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]); /* case 1: def(1) of the add is written out, so v_mul_u32_u24 and v_add_co_u32 are expected to remain separate */ + writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp()); + + finish_opt_test(); + } +END_TEST