diff --git a/src/amd/compiler/aco_insert_fp_mode.cpp b/src/amd/compiler/aco_insert_fp_mode.cpp index 10d5ff51c2c..f251bb97c86 100644 --- a/src/amd/compiler/aco_insert_fp_mode.cpp +++ b/src/amd/compiler/aco_insert_fp_mode.cpp @@ -401,6 +401,15 @@ emit_set_mode_block(fp_mode_ctx* ctx, Block* block) instr->opcode = aco_opcode::v_fma_mixlo_f16; else instr->opcode = aco_opcode::v_fma_mixhi_f16; + } else if (instr->opcode == aco_opcode::p_v_add_f64_rtne || + instr->opcode == aco_opcode::p_v_fract_f64_rtne) { + instr_state.require(mode_round16_64, fp_round_ne); + instr_state.require(mode_denorm16_64, default_state.fields[mode_denorm16_64]); + + if (instr->opcode == aco_opcode::p_v_add_f64_rtne) + instr->opcode = aco_opcode::v_add_f64_e64; + else + instr->opcode = aco_opcode::v_fract_f64; } else { mode_mask default_needs = instr_default_needs(ctx, instr); u_foreach_bit (i, default_needs) diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 965b1744374..b4d3489d4e3 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -1079,6 +1079,7 @@ VOP1 = { ("v_frexp_exp_i32_f64", dst(U32), src(F64), op(0x3c, gfx8=0x30, gfx10=0x3c), InstrClass.ValuDouble), ("v_frexp_mant_f64", dst(noMods(F64)), src(F64), op(0x3d, gfx8=0x31, gfx10=0x3d), InstrClass.ValuDouble), ("v_fract_f64", dst(F64), src(F64), op(0x3e, gfx8=0x32, gfx10=0x3e), InstrClass.ValuDouble), + ("p_v_fract_f64_rtne", dst(F64), src(F64), op(-1), InstrClass.ValuDouble), # Used for lowering v_floor_f64 on GFX6 ("v_frexp_exp_i32_f32", dst(U32), src(F32), op(0x3f, gfx8=0x33, gfx10=0x3f)), ("v_frexp_mant_f32", dst(noMods(F32)), src(F32), op(0x40, gfx8=0x34, gfx10=0x40)), ("v_clrexcp", dst(), src(), op(0x41, gfx8=0x35, gfx10=0x41, gfx11=-1)), @@ -1334,6 +1335,7 @@ VOP3 = { ("v_lshr_b64", dst(U64), src(U64, U32), op(0x162, gfx8=-1), InstrClass.Valu64), ("v_ashr_i64", dst(I64), src(I64, U32), op(0x163, gfx8=-1), InstrClass.Valu64), ("v_add_f64_e64", dst(F64), src(F64, F64), op(0x164, gfx8=0x280, gfx10=0x164, gfx11=0x327, gfx12=-1), InstrClass.ValuDoubleAdd), # GFX12 is VOP2 + ("p_v_add_f64_rtne", dst(F64), src(F64, F64), op(-1), InstrClass.ValuDoubleAdd), # Used for lowering v_floor_f64 on GFX6 ("v_mul_f64_e64", dst(F64), src(F64, F64), op(0x165, gfx8=0x281, gfx10=0x165, gfx11=0x328, gfx12=-1), InstrClass.ValuDouble), # GFX12 is VOP2 ("v_min_f64_e64", dst(F64), src(F64, F64), op(0x166, gfx8=0x282, gfx10=0x166, gfx11=0x329, gfx12=-1), InstrClass.ValuDouble), # GFX12 is VOP2 ("v_max_f64_e64", dst(F64), src(F64, F64), op(0x167, gfx8=0x283, gfx10=0x167, gfx11=0x32a, gfx12=-1), InstrClass.ValuDouble), # GFX12 is VOP2 diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index a6bc1d85ec6..ac2c2f9a4f2 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3256,6 +3256,7 @@ backpropagate_input_modifiers(opt_ctx& ctx, alu_opt_info& info, const alu_opt_op case aco_opcode::s_add_f32: case aco_opcode::s_add_f16: case aco_opcode::v_pk_add_f16: + case aco_opcode::p_v_add_f64_rtne: case aco_opcode::v_fma_f64: case aco_opcode::v_fma_f32: case aco_opcode::v_fma_f16: