diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index a0790a11e9b..b7426480c0a 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1296,6 +1296,33 @@ is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64 return offset >= min && offset <= max; } +bool /* detect_clamp: returns true when instr has omod == 0 and opsel == 0, its three operands include a constant 0 and a constant 1.0 (0x3c00 when the opcode is v_med3_f16, otherwise 0x3f800000), neither with its neg bit set, and the operand recorded in idx is a temp. On success that operand index is written to *clamped_idx; on failure *clamped_idx is left untouched. The abs bits and the neg bit of the temp operand are not examined here, so callers that care must check them. NOTE(review): instr->valu() is called unconditionally, so instr is assumed to be a VALU instruction; the callers in this patch only pass v_med3_f16/v_med3_f32. */ +detect_clamp(Instruction* instr, unsigned* clamped_idx) +{ + VALU_instruction& valu = instr->valu(); + if (valu.omod != 0 || valu.opsel != 0) + return false; + + unsigned idx = 0; + bool found_zero = false, found_one = false; + bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16; + for (unsigned i = 0; i < 3; i++) { + if (!valu.neg[i] && instr->operands[i].constantEquals(0)) + found_zero = true; + else if (!valu.neg[i] && + instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */ + found_one = true; + else /* operand is neither constant, or is one of them with neg set: remember it as the candidate clamped value (the last such operand wins) */ + idx = i; + } + if (found_zero && found_one && instr->operands[idx].isTemp()) { /* idx may still be its initial 0 if no operand reached the else branch; the isTemp() test is the only guard for that case */ + *clamped_idx = idx; + return true; + } else { + return false; + } +} + void label_instruction(opt_ctx& ctx, aco_ptr& instr) { @@ -1882,22 +1909,8 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) break; case aco_opcode::v_med3_f16: case aco_opcode::v_med3_f32: { /* clamp */ - VALU_instruction& vop3 = instr->valu(); - if (vop3.abs != 0 || vop3.neg != 0 || vop3.omod != 0 || vop3.opsel != 0) - break; - - unsigned idx = 0; - bool found_zero = false, found_one = false; - bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16; - for (unsigned i = 0; i < 3; i++) { - if (instr->operands[i].constantEquals(0)) - found_zero = true; - else if (instr->operands[i].constantEquals(is_fp16 ? 
0x3c00 : 0x3f800000)) /* 1.0 */ - found_one = true; - else - idx = i; - } - if (found_zero && found_one && instr->operands[idx].isTemp()) + unsigned idx; + if (detect_clamp(instr.get(), &idx) && !instr->valu().abs && !instr->valu().neg) /* the clamp label is still only set when no abs/neg bit is set at all, matching the abs/neg/omod/opsel checks removed by this hunk (omod and opsel are checked inside detect_clamp) */ ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get()); break; } @@ -4503,6 +4516,19 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) */ ctx.mad_infos.emplace_back(nullptr, 0); ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1); + } else if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) { /* rewrite a med3 recognised by detect_clamp as an add of the clamped operand and constant zero with the clamp modifier set; unlike the labelling code, abs/neg on the clamped operand are accepted here and carried over */ + unsigned idx; + if (detect_clamp(instr.get(), &idx)) { + instr->format = asVOP3(Format::VOP2); + instr->operands[0] = instr->operands[idx]; /* the clamped value becomes source 0 */ + instr->operands[1] = Operand::zero(); + instr->opcode = + instr->opcode == aco_opcode::v_med3_f32 ? aco_opcode::v_add_f32 : aco_opcode::v_add_f16; + instr->valu().clamp = true; + instr->valu().abs = (uint8_t)instr->valu().abs[idx]; /* NOTE(review): presumably this leaves only the operand-0 bit set, taken from the old bit of operand idx - confirm against the bitfield type */ + instr->valu().neg = (uint8_t)instr->valu().neg[idx]; /* same transfer for neg; abs[idx] and neg[idx] are both read before the respective field is overwritten */ + instr->operands.pop_back(); /* operands 0 and 1 were reassigned above, so the last operand is dropped */ + } } else { aco_opcode min, max, min3, max3, med3, minmax; bool some_gfx9_only; diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 9692ff27b58..ae0b6b95192 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -1174,12 +1174,12 @@ BEGIN_TEST(optimize.casts) writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(a16))), a16)); //! v1: %res3_tmp = v_mul_f32 %a, %a - //! v2b: %res3 = v_med3_f16 0, 1.0, %res3_tmp + //! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp //! p_unit_test 3, %res3 writeout(3, fsat(u2u16(fmul(a, a)))); //! v2b: %res4_tmp = v_mul_f16 %a16, %a16 - //! v1: %res4 = v_med3_f32 0, 1.0, %res4_tmp + //! v1: %res4 = v_add_f32 %res4_tmp, 0 clamp //! 
p_unit_test 4, %res4 writeout(4, fsat(bld.as_uniform(fmul(a16, a16)))); @@ -1701,12 +1701,12 @@ BEGIN_TEST(optimize.mad_mix.cast) writeout(3, f2f32(u2u16(fmul(a, a)))); //! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0 - //! v2b: %res4 = v_med3_f16 0, 1.0, %res4_mul + //! v2b: %res4 = v_add_f16 %res4_mul, 0 clamp //! p_unit_test 4, %res4 writeout(4, fsat(u2u16(fmul(f2f32(a16), a)))); //! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0 - //! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul + //! v1: %res5 = v_add_f32 %res5_mul, 0 clamp //! p_unit_test 5, %res5 writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a))))); diff --git a/src/amd/compiler/tests/test_sdwa.cpp b/src/amd/compiler/tests/test_sdwa.cpp index 45f8c3e96c0..d398373da31 100644 --- a/src/amd/compiler/tests/test_sdwa.cpp +++ b/src/amd/compiler/tests/test_sdwa.cpp @@ -551,7 +551,7 @@ BEGIN_TEST(optimize.sdwa.insert_modifiers) writeout(2, val); //! v1: %tmp3 = v_rcp_f32 %a dst_sel:ubyte0 src0_sel:dword - //! v1: %res3 = v_med3_f32 %tmp3, 0, 1.0 + //! v1: %res3 = v_add_f32 %tmp3, 0 clamp //! p_unit_test 3, %res3 val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]); val = bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u));