Add a VOPD instruction format to the compiler sources under src/amd/compiler.

Summary of the hunks below:
  * aco_opcodes.py: new Format.VOPD, its builder field ('aco_opcode', 'opy'),
    and 17 v_dual_* opcodes registered with only the gfx11 opcode number set
    (the other slots are -1).
  * aco_builder_h.py: "vopd" builder entry with shapes (2, 2) through (2, 6).
  * aco_ir.h: VOPD_instruction (VALU_instruction plus an opy opcode field),
    Instruction::vopd() and isVOPD(); isVALU() now also accepts VOPD.
  * aco_assembler.cpp: emit_instruction pushes two dwords for Format::VOPD.
  * aco_print_ir.cpp: prints the name of the opy opcode.
  * tests/test_assembler.cpp: new assembler.vopd test, set up for GFX11.

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 49508e96137..1500984a121 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -468,6 +468,35 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       out.push_back(encoding);
       break;
    }
+   case Format::VOPD: {
+      VOPD_instruction& vopd = instr->vopd();
+      uint32_t encoding = (0b110010 << 26);
+      encoding |= reg(ctx, instr->operands[0]);
+      if (instr->opcode != aco_opcode::v_dual_mov_b32)
+         encoding |= reg(ctx, instr->operands[1], 8) << 9;
+      encoding |= (uint32_t)ctx.opcode[(int)vopd.opy] << 17;
+      encoding |= opcode << 22;
+      out.push_back(encoding);
+
+      unsigned opy_start = instr->opcode == aco_opcode::v_dual_mov_b32 ? 1 : 2;
+      switch (instr->opcode) {
+      case aco_opcode::v_dual_fmac_f32:
+      case aco_opcode::v_dual_fmaak_f32:
+      case aco_opcode::v_dual_fmamk_f32:
+      case aco_opcode::v_dual_cndmask_b32:
+      case aco_opcode::v_dual_dot2acc_f32_f16:
+      case aco_opcode::v_dual_dot2acc_f32_bf16: opy_start = 3; break;
+      default: break;
+      }
+
+      encoding = reg(ctx, instr->operands[opy_start]);
+      if (vopd.opy != aco_opcode::v_dual_mov_b32)
+         encoding |= reg(ctx, instr->operands[opy_start + 1], 8) << 9;
+      encoding |= (reg(ctx, instr->definitions[1], 8) >> 1) << 17;
+      encoding |= reg(ctx, instr->definitions[0], 8) << 24;
+      out.push_back(encoding);
+      break;
+   }
    case Format::DS: {
       DS_instruction& ds = instr->ds();
       uint32_t encoding = (0b110110 << 26);
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index e4518a0f9c5..f5106a29847 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -578,6 +578,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
            ("vopc_sdwa", [Format.VOPC, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2])),
            ("vop3", [Format.VOP3], 'VALU_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
            ("vop3p", [Format.VOP3P], 'VALU_instruction', [(1, 2), (1, 3)]),
+           ("vopd", [Format.VOPD], 'VOPD_instruction', [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6)]),
            ("vinterp_inreg", [Format.VINTERP_INREG], 'VINTERP_inreg_instruction', [(1, 3)]),
            ("vintrp", [Format.VINTRP], 'VINTRP_instruction', [(1, 2), (1, 3)]),
            ("vop1_dpp", [Format.VOP1, Format.DPP16], 'DPP16_instruction', [(1, 1)]),
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index cd4ceb221ce..107961fa524 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -957,6 +957,7 @@ struct Pseudo_reduction_instruction;
 struct VALU_instruction;
 struct VINTERP_inreg_instruction;
 struct VINTRP_instruction;
+struct VOPD_instruction;
 struct DPP16_instruction;
 struct DPP8_instruction;
 struct SDWA_instruction;
@@ -1210,6 +1211,17 @@ struct Instruction {
       return *(VINTERP_inreg_instruction*)this;
    }
    constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
+   VOPD_instruction& vopd() noexcept
+   {
+      assert(isVOPD());
+      return *(VOPD_instruction*)this;
+   }
+   const VOPD_instruction& vopd() const noexcept
+   {
+      assert(isVOPD());
+      return *(VOPD_instruction*)this;
+   }
+   constexpr bool isVOPD() const noexcept { return format == Format::VOPD; }
    constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
    constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
    constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
@@ -1278,7 +1290,8 @@ struct Instruction {
    }
    constexpr bool isVALU() const noexcept
    {
-      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG();
+      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG() ||
+             isVOPD();
    }
 
    constexpr bool isSALU() const noexcept
@@ -1368,6 +1381,12 @@ struct VINTERP_inreg_instruction : public VALU_instruction {
 static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4,
               "Unexpected padding");
 
+struct VOPD_instruction : public VALU_instruction {
+   aco_opcode opy;
+   uint16_t padding;
+};
+static_assert(sizeof(VOPD_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
+
 /**
  * Data Parallel Primitives Format:
  * This format can be used for VOP1, VOP2 or VOPC instructions.
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 1cf23b5e061..4a512113c3f 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -89,6 +89,7 @@ class Format(IntEnum):
    VINTRP = auto()
    # Vector ALU Formats
    VINTERP_INREG = auto()
+   VOPD = auto()
    VOP1 = 1 << 7
    VOP2 = 1 << 8
    VOPC = 1 << 9
@@ -186,6 +187,8 @@ class Format(IntEnum):
       elif self == Format.VOP3P:
          return [('uint8_t', 'opsel_lo', None),
                  ('uint8_t', 'opsel_hi', None)]
+      elif self == Format.VOPD:
+         return [('aco_opcode', 'opy', None)]
       elif self == Format.VINTERP_INREG:
          return [('unsigned', 'wait_exp', 7),
                  ('uint8_t', 'opsel', 0)]
@@ -1272,6 +1275,29 @@ for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, defs, ops, cls
    opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP3, cls, in_mod, out_mod, definitions = defs, operands = ops)
 
 
+VOPD = {
+   (0x00, "v_dual_fmac_f32"),
+   (0x01, "v_dual_fmaak_f32"),
+   (0x02, "v_dual_fmamk_f32"),
+   (0x03, "v_dual_mul_f32"),
+   (0x04, "v_dual_add_f32"),
+   (0x05, "v_dual_sub_f32"),
+   (0x06, "v_dual_subrev_f32"),
+   (0x07, "v_dual_mul_dx9_zero_f32"),
+   (0x08, "v_dual_mov_b32"),
+   (0x09, "v_dual_cndmask_b32"),
+   (0x0a, "v_dual_max_f32"),
+   (0x0b, "v_dual_min_f32"),
+   (0x0c, "v_dual_dot2acc_f32_f16"),
+   (0x0d, "v_dual_dot2acc_f32_bf16"),
+   (0x10, "v_dual_add_nc_u32"),
+   (0x11, "v_dual_lshlrev_b32"),
+   (0x12, "v_dual_and_b32"),
+}
+for gfx11, name in VOPD:
+   opcode(name, -1, -1, -1, gfx11, format = Format.VOPD, cls = InstrClass.Valu32)
+
+
 # DS instructions: 3 inputs (1 addr, 2 data), 1 output
 DS = {
    (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"),
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index a0ed4ebae89..5a7ae9d94b9 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -443,6 +443,12 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
       fprintf(output, " attr%d.%c", vintrp.attribute, "xyzw"[vintrp.component]);
       break;
    }
+   case Format::VOPD: {
+      const VOPD_instruction& vopd = instr->vopd();
+      // TODO: beautify
+      fprintf(output, " %s", instr_info.name[(int)vopd.opy]);
+      break;
+   }
    case Format::DS: {
       const DS_instruction& ds = instr->ds();
       if (ds.offset0)
diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp
index a7106e98686..8c92e669a64 100644
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@@ -1055,3 +1055,62 @@ BEGIN_TEST(assembler.vop3_dpp)
 
    finish_assembler_test();
 END_TEST
+
+BEGIN_TEST(assembler.vopd)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   Definition dst_v0 = bld.def(v1);
+   dst_v0.setFixed(PhysReg(256));
+
+   Definition dst_v1 = bld.def(v1);
+   dst_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v0(bld.tmp(v1));
+   op_v0.setFixed(PhysReg(256 + 0));
+
+   Operand op_v1(bld.tmp(v1));
+   op_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v2(bld.tmp(v1));
+   op_v2.setFixed(PhysReg(256 + 2));
+
+   Operand op_v3(bld.tmp(v1));
+   op_v3.setFixed(PhysReg(256 + 3));
+
+   Operand op_s0(bld.tmp(s1));
+   op_s0.setFixed(PhysReg(0));
+
+   Operand op_vcc(bld.tmp(s1));
+   op_vcc.setFixed(vcc);
+
+   //>> BB0:
+   //! v_dual_mov_b32 v0, v0 :: v_dual_mov_b32 v1, v1 ; ca100100 00000101
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v1, s0 ; ca1000ff 00000000 00000060
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, Operand::c32(96), op_s0,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0x60 ; ca100000 000000ff 00000060
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_s0, Operand::c32(96),
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mul_f32 v0, v0, v1 :: v_dual_mov_b32 v1, v2 ; c8d00300 00000102
+   bld.vopd(aco_opcode::v_dual_mul_f32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v1, v3 ; c8100501 00000103
+   bld.vopd(aco_opcode::v_dual_fmac_f32, dst_v0, dst_v1, op_v1, op_v2, op_v0, op_v3,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, v0 :: v_dual_and_b32 v1, v1, v2 ; ca240100 00000501
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
+            aco_opcode::v_dual_and_b32);
+
+   //! v_dual_cndmask_b32 v0, v0, v1 :: v_dual_cndmask_b32 v1, v2, v3 ; ca520300 00000702
+   bld.vopd(aco_opcode::v_dual_cndmask_b32, dst_v0, dst_v1, op_v0, op_v1, op_vcc, op_v2, op_v3,
+            op_vcc, aco_opcode::v_dual_cndmask_b32);
+
+   finish_assembler_test();
+END_TEST