aco: add VOPD format

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23367>
2025-12-21 07:10:09 +01:00 · 2023-05-22 14:15:58 +01:00 · 2023-05-22 14:15:58 +01:00 · 6547e17e60
commit 6547e17e60
parent 54c52932d4
6 changed files with 141 additions and 1 deletions
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@ -468,6 +468,35 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
      out.push_back(encoding);
      break;
   }
   case Format::VOPD: {
      /* A VOPD instruction carries two operations in one Instruction: the first is
       * instr->opcode, the second is vopd.opy. This case pushes two dwords. */
      VOPD_instruction& vopd = instr->vopd();

      /* First dword: fixed 0b110010 pattern in the top six bits, then the sources of the
       * first operation and both hardware opcodes. */
      uint32_t encoding = (0b110010 << 26);
      encoding |= reg(ctx, instr->operands[0]);
      /* v_dual_mov_b32 is treated as single-source, so operands[1] already belongs to the
       * second operation and must not be encoded here. */
      if (instr->opcode != aco_opcode::v_dual_mov_b32)
         encoding |= reg(ctx, instr->operands[1], 8) << 9;
      encoding |= (uint32_t)ctx.opcode[(int)vopd.opy] << 17;
      /* NOTE(review): `opcode` is declared outside this case; presumably it is
       * ctx.opcode[] of instr->opcode -- confirm. */
      encoding |= opcode << 22;
      out.push_back(encoding);

      /* Index of the first operand that belongs to the second operation (opy):
       *   1 if the first operation is v_dual_mov_b32,
       *   3 if the first operation is one of the opcodes listed in the switch below,
       *   2 otherwise.
       * NOTE(review): the listed opcodes presumably carry an extra accumulator, constant or
       * condition operand -- confirm. This case never reads that third operand, nor any
       * operand past opy_start + 1. */
      unsigned opy_start = instr->opcode == aco_opcode::v_dual_mov_b32 ? 1 : 2;
      switch (instr->opcode) {
      case aco_opcode::v_dual_fmac_f32:
      case aco_opcode::v_dual_fmaak_f32:
      case aco_opcode::v_dual_fmamk_f32:
      case aco_opcode::v_dual_cndmask_b32:
      case aco_opcode::v_dual_dot2acc_f32_f16:
      case aco_opcode::v_dual_dot2acc_f32_bf16: opy_start = 3; break;
      default: break;
      }

      /* Second dword: sources of the second operation, then both destinations. */
      encoding = reg(ctx, instr->operands[opy_start]);
      if (vopd.opy != aco_opcode::v_dual_mov_b32)
         encoding |= reg(ctx, instr->operands[opy_start + 1], 8) << 9;
      /* The lowest bit of definitions[1]'s register number is dropped before encoding.
       * NOTE(review): presumably the hardware derives that bit itself -- confirm against
       * the ISA documentation. */
      encoding |= (reg(ctx, instr->definitions[1], 8) >> 1) << 17;
      encoding |= reg(ctx, instr->definitions[0], 8) << 24;
      out.push_back(encoding);
      break;
   }
   case Format::DS: {
      DS_instruction& ds = instr->ds();
      uint32_t encoding = (0b110110 << 26);
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@ -578,6 +578,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
           ("vopc_sdwa", [Format.VOPC, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2])),
           ("vop3", [Format.VOP3], 'VALU_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
           ("vop3p", [Format.VOP3P], 'VALU_instruction', [(1, 2), (1, 3)]),
           ("vopd", [Format.VOPD], 'VOPD_instruction', [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6)]),
           ("vinterp_inreg", [Format.VINTERP_INREG], 'VINTERP_inreg_instruction', [(1, 3)]),
           ("vintrp", [Format.VINTRP], 'VINTRP_instruction', [(1, 2), (1, 3)]),
           ("vop1_dpp", [Format.VOP1, Format.DPP16], 'DPP16_instruction', [(1, 1)]),
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@ -957,6 +957,7 @@ struct Pseudo_reduction_instruction;
 struct VALU_instruction;
 struct VINTERP_inreg_instruction;
 struct VINTRP_instruction;
 struct VOPD_instruction;
 struct DPP16_instruction;
 struct DPP8_instruction;
 struct SDWA_instruction;
@ -1210,6 +1211,17 @@ struct Instruction {
      return *(VINTERP_inreg_instruction*)this;
   }
   constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
   /* Views this instruction through the VOPD layout. The format is asserted first because
    * the cast itself is unchecked. */
   VOPD_instruction& vopd() noexcept
   {
      assert(isVOPD());
      VOPD_instruction* const as_vopd = (VOPD_instruction*)this;
      return *as_vopd;
   }
   /* Read-only view of this instruction through the VOPD layout. The format is asserted
    * first because the cast itself is unchecked. */
   const VOPD_instruction& vopd() const noexcept
   {
      assert(isVOPD());
      const VOPD_instruction* const as_vopd = (const VOPD_instruction*)this;
      return *as_vopd;
   }
   /* True when the instruction's format is exactly Format::VOPD (an equality test, not a
    * bit test). */
   constexpr bool isVOPD() const noexcept
   {
      return format == Format::VOPD;
   }
   constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
   constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
   constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
@ -1278,7 +1290,8 @@ struct Instruction {
   }
   constexpr bool isVALU() const noexcept
   {
-      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG();
+      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG() ||
             isVOPD();
   }
   constexpr bool isSALU() const noexcept
@ -1368,6 +1381,12 @@ struct VINTERP_inreg_instruction : public VALU_instruction {
 static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4,
              "Unexpected padding");
 /**
 * VOPD Format:
 * A VALU instruction that carries a second opcode (opy) in addition to the regular
 * Instruction opcode.
 */
 struct VOPD_instruction : public VALU_instruction {
   /* Opcode of the second operation. */
   aco_opcode opy;
   /* Explicit filler so that the struct is exactly 4 bytes larger than VALU_instruction,
    * as checked by the static_assert below. */
   uint16_t padding;
 };
 static_assert(sizeof(VOPD_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
 /**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@ -89,6 +89,7 @@ class Format(IntEnum):
   VINTRP = auto()
   # Vector ALU Formats
   VINTERP_INREG = auto()
   VOPD = auto()
   VOP1 = 1 << 7
   VOP2 = 1 << 8
   VOPC = 1 << 9
@ -186,6 +187,8 @@ class Format(IntEnum):
      elif self == Format.VOP3P:
         return [('uint8_t', 'opsel_lo', None),
                 ('uint8_t', 'opsel_hi', None)]
      elif self == Format.VOPD:
         return [('aco_opcode', 'opy', None)]
      elif self == Format.VINTERP_INREG:
         return [('unsigned', 'wait_exp', 7),
                 ('uint8_t', 'opsel', 0)]
@ -1272,6 +1275,29 @@ for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, defs, ops, cls
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP3, cls, in_mod, out_mod, definitions = defs, operands = ops)
 # VOPD opcode table. Each entry is (gfx11 hardware opcode number, instruction name).
 VOPD = {
   (0x00, "v_dual_fmac_f32"),
   (0x01, "v_dual_fmaak_f32"),
   (0x02, "v_dual_fmamk_f32"),
   (0x03, "v_dual_mul_f32"),
   (0x04, "v_dual_add_f32"),
   (0x05, "v_dual_sub_f32"),
   (0x06, "v_dual_subrev_f32"),
   (0x07, "v_dual_mul_dx9_zero_f32"),
   (0x08, "v_dual_mov_b32"),
   (0x09, "v_dual_cndmask_b32"),
   (0x0a, "v_dual_max_f32"),
   (0x0b, "v_dual_min_f32"),
   (0x0c, "v_dual_dot2acc_f32_f16"),
   (0x0d, "v_dual_dot2acc_f32_bf16"),
   (0x10, "v_dual_add_nc_u32"),
   (0x11, "v_dual_lshlrev_b32"),
   (0x12, "v_dual_and_b32"),
 }
 # Register every VOPD opcode. The three -1 arguments fill the gfx7/gfx9/gfx10 opcode
 # slots of opcode(); presumably -1 marks the instruction as unavailable there -- confirm.
 for gfx11, name in VOPD:
   opcode(name, -1, -1, -1, gfx11, format = Format.VOPD, cls = InstrClass.Valu32)
 # DS instructions: 3 inputs (1 addr, 2 data), 1 output
 DS = {
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"),
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@ -443,6 +443,12 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
      fprintf(output, " attr%d.%c", vintrp.attribute, "xyzw"[vintrp.component]);
      break;
   }
   case Format::VOPD: {
      const VOPD_instruction& vopd = instr->vopd();
      // TODO: beautify
      fprintf(output, " %s", instr_info.name[(int)vopd.opy]);
      break;
   }
   case Format::DS: {
      const DS_instruction& ds = instr->ds();
      if (ds.offset0)
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@ -1055,3 +1055,62 @@ BEGIN_TEST(assembler.vop3_dpp)
   finish_assembler_test();
 END_TEST
 BEGIN_TEST(assembler.vopd)
   /* Checks the two-dword encoding of VOPD instructions on GFX11. Returns early when the
    * GFX11 setup fails. */
   if (!setup_cs(NULL, GFX11))
      return;
   /* Destinations fixed to PhysReg(256) and PhysReg(256 + 1); the expected output prints
    * them as v0 and v1. */
   Definition dst_v0 = bld.def(v1);
   dst_v0.setFixed(PhysReg(256));
   Definition dst_v1 = bld.def(v1);
   dst_v1.setFixed(PhysReg(256 + 1));
   /* Vector sources fixed to PhysReg(256 + 0) .. PhysReg(256 + 3), printed as v0..v3. */
   Operand op_v0(bld.tmp(v1));
   op_v0.setFixed(PhysReg(256 + 0));
   Operand op_v1(bld.tmp(v1));
   op_v1.setFixed(PhysReg(256 + 1));
   Operand op_v2(bld.tmp(v1));
   op_v2.setFixed(PhysReg(256 + 2));
   Operand op_v3(bld.tmp(v1));
   op_v3.setFixed(PhysReg(256 + 3));
   /* Scalar sources: one fixed to PhysReg(0) (printed as s0) and one fixed to vcc. */
   Operand op_s0(bld.tmp(s1));
   op_s0.setFixed(PhysReg(0));
   Operand op_vcc(bld.tmp(s1));
   op_vcc.setFixed(vcc);
   /* Each bld.vopd() call below passes: first opcode, both definitions, the operands of
    * the first operation, the operands of the second operation, second opcode. */
   //>> BB0:
   /* Both operations are single-source moves from vector registers. */
   //! v_dual_mov_b32 v0, v0 :: v_dual_mov_b32 v1, v1 ; ca100100 00000101
   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, aco_opcode::v_dual_mov_b32);
   /* Constant 0x60 as source of the first operation: the expected encoding has a third
    * dword holding 0x60. */
   //! v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v1, s0 ; ca1000ff 00000000 00000060
   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, Operand::c32(96), op_s0,
            aco_opcode::v_dual_mov_b32);
   /* Same as above with the constant as source of the second operation. */
   //! v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0x60 ; ca100000 000000ff 00000060
   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_s0, Operand::c32(96),
            aco_opcode::v_dual_mov_b32);
   /* Two-source first operation followed by a single-source second operation. */
   //! v_dual_mul_f32 v0, v0, v1 :: v_dual_mov_b32 v1, v2 ; c8d00300 00000102
   bld.vopd(aco_opcode::v_dual_mul_f32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
            aco_opcode::v_dual_mov_b32);
   /* First operation with three operands; the third one (op_v0) does not appear in the
    * expected disassembly. NOTE(review): presumably the accumulator -- confirm. */
   //! v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v1, v3 ; c8100501 00000103
   bld.vopd(aco_opcode::v_dual_fmac_f32, dst_v0, dst_v1, op_v1, op_v2, op_v0, op_v3,
            aco_opcode::v_dual_mov_b32);
   /* Single-source first operation followed by a two-source second operation. */
   //! v_dual_mov_b32 v0, v0 :: v_dual_and_b32 v1, v1, v2 ; ca240100 00000501
   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
            aco_opcode::v_dual_and_b32);
   /* Both operations take vcc as a third operand; it does not appear in the expected
    * disassembly. */
   //! v_dual_cndmask_b32 v0, v0, v1 :: v_dual_cndmask_b32 v1, v2, v3 ; ca520300 00000702
   bld.vopd(aco_opcode::v_dual_cndmask_b32, dst_v0, dst_v1, op_v0, op_v1, op_vcc, op_v2, op_v3,
            op_vcc, aco_opcode::v_dual_cndmask_b32);
   finish_assembler_test();
 END_TEST