diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 4557fd0ff47..2043990fc79 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -4880,38 +4880,49 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) continue; ssa_info info = ctx.info[instr->operands[i].tempId()]; + if (!info.is_dpp() || info.instr->pass_flags != instr->pass_flags) + continue; + aco_opcode swapped_op; - if (info.is_dpp() && info.instr->pass_flags == instr->pass_flags && - (i == 0 || can_swap_operands(instr, &swapped_op)) && - can_use_DPP(instr, true, info.is_dpp8()) && !instr->isDPP()) { - bool dpp8 = info.is_dpp8(); - convert_to_DPP(instr, dpp8); - if (dpp8) { - DPP8_instruction* dpp = &instr->dpp8(); - for (unsigned j = 0; j < 8; ++j) - dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j]; - if (i) { - instr->opcode = swapped_op; - std::swap(instr->operands[0], instr->operands[1]); - } - } else { - DPP16_instruction* dpp = &instr->dpp16(); - if (i) { - instr->opcode = swapped_op; - std::swap(instr->operands[0], instr->operands[1]); - std::swap(dpp->neg[0], dpp->neg[1]); - std::swap(dpp->abs[0], dpp->abs[1]); - } - dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl; - dpp->bound_ctrl = info.instr->dpp16().bound_ctrl; - dpp->neg[0] ^= info.instr->dpp16().neg[0] && !dpp->abs[0]; - dpp->abs[0] |= info.instr->dpp16().abs[0]; + if (i != 0 && !can_swap_operands(instr, &swapped_op)) + continue; + + if (instr->isDPP() || !can_use_DPP(instr, true, info.is_dpp8())) + continue; + + bool dpp8 = info.is_dpp8(); + bool input_mods = instr_info.can_use_input_modifiers[(int)instr->opcode] && + instr_info.operand_size[(int)instr->opcode] == 32; + if (!dpp8 && (info.instr->dpp16().neg[0] || info.instr->dpp16().abs[0]) && !input_mods) + continue; + + convert_to_DPP(instr, dpp8); + if (dpp8) { + DPP8_instruction* dpp = &instr->dpp8(); + for (unsigned j = 0; j < 8; ++j) + dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j]; + if (i) { + instr->opcode = swapped_op; + std::swap(instr->operands[0], instr->operands[1]); } - if (--ctx.uses[info.instr->definitions[0].tempId()]) - ctx.uses[info.instr->operands[0].tempId()]++; - instr->operands[0].setTemp(info.instr->operands[0].getTemp()); - break; + } else { + DPP16_instruction* dpp = &instr->dpp16(); + if (i) { + instr->opcode = swapped_op; + std::swap(instr->operands[0], instr->operands[1]); + std::swap(dpp->neg[0], dpp->neg[1]); + std::swap(dpp->abs[0], dpp->abs[1]); + } + dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl; + dpp->bound_ctrl = info.instr->dpp16().bound_ctrl; + dpp->neg[0] ^= info.instr->dpp16().neg[0] && !dpp->abs[0]; + dpp->abs[0] |= info.instr->dpp16().abs[0]; } + + if (--ctx.uses[info.instr->definitions[0].tempId()]) + ctx.uses[info.instr->operands[0].tempId()]++; + instr->operands[0].setTemp(info.instr->operands[0].getTemp()); + break; } } diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 91fc663927c..510b9e196da 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -511,6 +511,11 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr& instr) if (i && !can_swap_operands(instr, &instr->opcode)) continue; + bool input_mods = instr_info.can_use_input_modifiers[(int)instr->opcode] && + instr_info.operand_size[(int)instr->opcode] == 32; + if (!dpp8 && (mov->dpp16().neg[0] || mov->dpp16().abs[0]) && !input_mods) + continue; + if (!dpp8) /* anything else doesn't make sense in SSA */ assert(mov->dpp16().row_mask == 0xf && mov->dpp16().bank_mask == 0xf); diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index e501fd076bd..043602d8626 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -1064,6 +1064,22 @@ BEGIN_TEST(optimizer.dpp) res7->vop3().abs[0] = true; writeout(7, res7); + //! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1 + //! v1: %res11 = v_add_u32 %tmp11, %b + //! p_unit_test 11, %res11 + auto tmp11 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); + tmp11->dpp16().neg[0] = true; + Temp res11 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), tmp11, b); + writeout(11, res11); + + //! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1 + //! v1: %res12 = v_add_f16 %tmp12, %b + //! p_unit_test 12, %res12 + auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); + tmp12->dpp16().neg[0] = true; + Temp res12 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1), tmp12, b); + writeout(12, res12); + /* vcc */ //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 //! p_unit_test 8, %res8 diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp index c5f0a3bf701..066f74f7510 100644 --- a/src/amd/compiler/tests/test_optimizer_postRA.cpp +++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp @@ -409,6 +409,22 @@ BEGIN_TEST(optimizer_postRA.dpp) res7->vop3().abs[0] = true; writeout(7, Operand(res7, reg_v2)); + //! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 + //! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1] + //! p_unit_test 12, %res12:v[2] + auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); + tmp12->dpp16().neg[0] = true; + Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b); + writeout(12, Operand(res12, reg_v2)); + + //! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 + //! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1] + //! p_unit_test 13, %res13:v[2] + auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); + tmp13->dpp16().neg[0] = true; + Temp res13 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1, reg_v2), Operand(tmp13, reg_v2), b); + writeout(13, Operand(res13, reg_v2)); + /* vcc */ //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 //! p_unit_test 8, %res8:v[2]