diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index ccd53b2fff9..63e8aba1ce6 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1101,6 +1101,33 @@ create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1, bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed)); } +void +emit_v_mov_b16(Builder& bld, Definition dst, Operand op) +{ + /* Emit a 16-bit (v2b) copy of op into dst. v_mov_b16 uses 32bit inline constants, so constant operands are rewritten before the move is built. */ + if (op.isConstant()) { + if (!op.isLiteral() && op.physReg() >= 240) { + /* v_add_f16 is smaller because it can use 16bit fp inline constants; the add of zero acts as the move. NOTE(review): presumably non-literal encodings >= 240 are the fp inline constants, confirm against the operand encoding table. */ + Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero()); + if (dst.physReg().byte() == 2) /* dst starts at byte 2, i.e. bits [16:32] of the register */ + instr->vop3().opsel = 0x8; + return; + } + op = Operand::c32((int32_t)(int16_t)op.constantValue()); /* sign-extend the 16-bit value to 32 bits, e.g. 0xff00 becomes 0xffffff00 */ + } + + if (!dst.physReg().byte() && !op.physReg().byte()) { /* both operands start at byte 0: no opsel bits are needed, so the plain VOP1 form is built */ + bld.vop1(aco_opcode::v_mov_b16, dst, op); + } else { + // TODO: this can use VOP1 for vgpr0-127 with assembler support + Instruction* instr = bld.vop1_e64(aco_opcode::v_mov_b16, dst, op); + if (op.physReg().byte() == 2) /* read the upper 16 bits of the source */ + instr->vop3().opsel |= 0x1; + if (dst.physReg().byte() == 2) /* write the upper 16 bits of the destination */ + instr->vop3().opsel |= 0x8; + } +} + void copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) { @@ -1166,6 +1193,8 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) } else { bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32); } + } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) { + emit_v_mov_b16(bld, dst, op); } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) { if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) { /* use v_mov_b32 to avoid possible issues with denormal flushing or @@ -1340,7 +1369,7 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres swiz[def.physReg().byte()] = op.physReg().byte(); create_bperm(bld, 
swiz, def, op); } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) { - addsub_subdword_gfx11(bld, def, op, Operand::zero(), false); + emit_v_mov_b16(bld, def, op); } else if (def.regClass().is_subdword()) { bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op); } else { @@ -1577,9 +1606,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera /* either hi or lo are already placed correctly */ if (ctx->program->gfx_level >= GFX11) { if (lo.physReg().reg() == def.physReg().reg()) - addsub_subdword_gfx11(bld, def_hi, hi, Operand::zero(), false); + emit_v_mov_b16(bld, def_hi, hi); else - addsub_subdword_gfx11(bld, def_lo, lo, Operand::zero(), false); + emit_v_mov_b16(bld, def_lo, lo); return; } else if (ctx->program->gfx_level >= GFX8) { if (lo.physReg().reg() == def.physReg().reg()) diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 035611d9089..ecd7d38441e 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -895,6 +895,7 @@ VOP1 = { ( -1, -1, -1, -1, -1, 0x69, "v_not_b16", False, False), ( -1, -1, -1, -1, -1, 0x6a, "v_cvt_i32_i16", False, False), ( -1, -1, -1, -1, -1, 0x6b, "v_cvt_u32_u16", False, False), + ( -1, -1, -1, -1, -1, 0x1c, "v_mov_b16", True, False), } for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, cls) in default_class(VOP1, InstrClass.Valu32): opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP1, cls, in_mod, out_mod) diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 9d2a27201ee..f6d08924a9f 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -241,7 +241,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] //~gfx[89]! 
v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 - //~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi + //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), @@ -253,8 +253,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 //~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 - //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[1][16:32]), 0 opsel_hi - //~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 %0:v[0][0:16], 0 opsel_hi + //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi + //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16] //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16] //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16] @@ -270,7 +270,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 - //~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 %0:v[0][0:16], 0 + //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16] //~gfx11! 
v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); bld.pseudo(aco_opcode::p_parallelcopy, @@ -285,7 +285,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104 - //~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi + //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b), @@ -386,7 +386,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx(8|9|11)! p_unit_test 11 //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1 - //~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 + //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 hi(%0:v[0][16:32]) //~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), @@ -466,21 +466,23 @@ BEGIN_TEST(to_hw_instr.subdword_constant) /* 16-bit copy */ //! p_unit_test 6 //~gfx(9|10)! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:dword - //~gfx11! v2b: %_:v[0][0:16] = v_pack_b32_f16 0.5, hi(%_:v[0][16:32]) + //~gfx11! v2b: %0:v[0][0:16] = v_add_f16 0.5, 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x3800)); //! p_unit_test 7 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0] - //~gfx(10|11)! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32]) + //~gfx10! 
v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32]) + //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0x4205 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x4205)); //! p_unit_test 8 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0] - //~gfx(10|11)! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205 + //~gfx10! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205 + //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0x4205 opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205)); @@ -530,7 +532,8 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //>> p_unit_test 13 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0] - //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d + //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d + //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0xff bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x00ff)); @@ -538,14 +541,15 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //! p_unit_test 14 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0] - //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504 + //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504 + //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0xffffff00 opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0xff00)); //! p_unit_test 15 //~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword - //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0c + //~gfx11! 
v2b: %0:v[0][0:16] = v_mov_b16 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::zero(2));