diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 961cce9d20b..56e4004a827 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -880,21 +880,21 @@ emit_vop3_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* */ if (instr->definitions.size() == 2 && instr->isVOPC()) assert(ctx.gfx_level <= GFX9 && instr->definitions[1].physReg() == exec); - else if (instr->definitions.size() == 2) + else if (instr->definitions.size() == 2 && instr->opcode != aco_opcode::v_swap_b16) encoding |= reg(ctx, instr->definitions[1]) << 8; encoding |= reg(ctx, instr->definitions[0], 8); out.push_back(encoding); encoding = 0; - if (instr->opcode == aco_opcode::v_interp_mov_f32) { - encoding = 0x3 & instr->operands[0].constantValue(); - } else if (instr->opcode == aco_opcode::v_writelane_b32_e64) { - encoding |= reg(ctx, instr->operands[0]) << 0; - encoding |= reg(ctx, instr->operands[1]) << 9; - /* Encoding src2 works fine with hardware but breaks some disassemblers. */ - } else { - for (unsigned i = 0; i < instr->operands.size(); i++) - encoding |= reg(ctx, instr->operands[i]) << (i * 9); - } + + unsigned num_ops = instr->operands.size(); + /* Encoding implicit sources works fine with hardware but breaks some disassemblers. 
*/ + if (instr->opcode == aco_opcode::v_writelane_b32_e64) + num_ops = 2; + else if (instr->opcode == aco_opcode::v_swap_b16) + num_ops = 1; + + for (unsigned i = 0; i < num_ops; i++) + encoding |= reg(ctx, instr->operands[i]) << (i * 9); encoding |= vop3.omod << 27; for (unsigned i = 0; i < 3; i++) encoding |= vop3.neg[i] << (29 + i); diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 2c131d39c82..153bc84e7f9 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -669,6 +669,7 @@ get_gfx11_true16_mask(aco_opcode op) case aco_opcode::v_sin_f16: case aco_opcode::v_sqrt_f16: case aco_opcode::v_trunc_f16: + case aco_opcode::v_swap_b16: case aco_opcode::v_mov_b16: return 0x1 | 0x8; case aco_opcode::v_add_f16: case aco_opcode::v_fmaak_f16: diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 75184742eda..b820b18fb00 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1317,19 +1317,6 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) } } -void -addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub) -{ - Instruction* instr = - bld.vop3(sub ? 
aco_opcode::v_sub_u16_e64 : aco_opcode::v_add_u16_e64, dst, src0, src1).instr; - if (src0.physReg().byte() == 2) - instr->valu().opsel |= 0x1; - if (src1.physReg().byte() == 2) - instr->valu().opsel |= 0x2; - if (dst.physReg().byte() == 2) - instr->valu().opsel |= 0x8; -} - bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc, PhysReg scratch_sgpr) @@ -1390,9 +1377,9 @@ swap_subdword_gfx11(Builder& bld, Definition def, Operand op) if (def.bytes() == 2) { Operand def_as_op = Operand(def.physReg(), def.regClass()); Definition op_as_def = Definition(op.physReg(), op.regClass()); - addsub_subdword_gfx11(bld, def, def_as_op, op, false); - addsub_subdword_gfx11(bld, op_as_def, def_as_op, op, true); - addsub_subdword_gfx11(bld, def, def_as_op, op, true); + Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op); + instr->valu().opsel[0] = op.physReg().byte(); + instr->valu().opsel[3] = def.physReg().byte(); } else { PhysReg op_half = op.physReg(); op_half.reg_b &= ~1; diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index a56f7147d28..a424400045a 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -974,6 +974,7 @@ VOP1 = { ("v_cvt_i32_i16", False, False, dst(1), src(1), op(gfx11=0x6a)), ("v_cvt_u32_u16", False, False, dst(1), src(1), op(gfx11=0x6b)), ("v_mov_b16", True, False, dst(1), src(1), op(gfx11=0x1c)), + ("v_swap_b16", False, False, dst(1, 1), src(1, 1), op(gfx11=0x66)), } for (name, in_mod, out_mod, defs, ops, num, cls) in default_class(VOP1, InstrClass.Valu32): insn(name, num, Format.VOP1, cls, in_mod, out_mod, definitions = defs, operands = ops) diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index a211214edfd..b2dc7a24be6 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -53,9 +53,7 @@ 
BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi - //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16] - //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16] - //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16] + //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b), @@ -130,13 +128,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 - //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) - //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi - //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) + //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 - //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) - //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi - //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) + //~gfx11! 
v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), Operand(v1_lo, v3b), Operand(v0_lo, v3b)); @@ -157,23 +151,15 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 - //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi - //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] - //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi + //~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 - //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi - //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] - //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi + //~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 - //~gfx11! 
v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) - //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi - //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) + //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506 - //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) - //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi - //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) + //~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b), Operand(v1_b1, v2b), Operand(v0_b1, v2b));