diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index b324f484075..4266696b9fb 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -114,6 +114,7 @@ enum Label { label_vcc_hint = 1 << 25, label_scc_needed = 1 << 26, label_b2i = 1 << 27, + label_fcanonicalize = 1 << 28, label_constant_16bit = 1 << 29, label_usedef = 1 << 30, /* generic label */ label_vop3p = 1 << 31, @@ -125,7 +126,7 @@ static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_o static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels; static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | - label_scc_invert | label_b2i; + label_scc_invert | label_b2i | label_fcanonicalize; static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect"); @@ -511,6 +512,18 @@ struct ssa_info { { return label & label_vop3p; } + + void set_fcanonicalize(Temp tmp) + { + add_label(label_fcanonicalize); + temp = tmp; + } + + bool is_fcanonicalize() + { + return label & label_fcanonicalize; + } + }; struct opt_ctx { @@ -876,6 +889,11 @@ bool fixed_to_exec(Operand op) return op.isFixed() && op.physReg() == exec; } +bool can_eliminate_fcanonicalize(aco_opcode op) +{ + return instr_info.can_use_input_modifiers[(int)op] && op != aco_opcode::v_cndmask_b32; +} + void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) { if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) { @@ -925,7 +943,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) /* VALU: propagate neg, abs & inline constants */ else if (instr->isVALU()) { - if (info.is_temp() && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) { + bool is_fp = can_eliminate_fcanonicalize(instr->opcode); + if ((info.is_temp() || (info.is_fcanonicalize() && is_fp)) && info.temp.type() == RegType::vgpr && valu_can_accept_vgpr(instr, i)) { instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } @@ -1316,6 +1335,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) !(fp16 ? block.fp_mode.must_flush_denorms16_64 : block.fp_mode.must_flush_denorms32) && !instr->definitions[0].isPrecise()) { /* 1.0 */ ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp()); + } else if (instr->operands[!i].constantValue() == (fp16 ? 0x3c00 : 0x3f800000)) { + ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(instr->operands[i].getTemp()); } else if (instr->operands[!i].constantValue() == 0u && !(fp16 ? block.fp_mode.preserve_signed_zero_inf_nan16_64 : block.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */ ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u); @@ -2530,7 +2551,8 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId(); } ssa_info& info = ctx.info[instr->operands[i].tempId()]; - if (info.is_temp() && info.temp.type() == RegType::sgpr) + if ((info.is_temp() || (info.is_fcanonicalize() && can_eliminate_fcanonicalize(instr->opcode))) && + info.temp.type() == RegType::sgpr) operand_mask |= 1u << i; } unsigned max_sgprs = 1;