diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index e520da50fec..4c404bab7e8 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -662,14 +662,14 @@ emit_instruction(asm_context& ctx, std::vector& out, Instruction* inst encoding |= vop3.neg_lo[i] << (29 + i); out.push_back(encoding); - } else if (instr->isDPP()) { + } else if (instr->isDPP16()) { assert(ctx.chip_class >= GFX8); - DPP_instruction& dpp = instr->dpp(); + DPP16_instruction& dpp = instr->dpp16(); /* first emit the instruction without the DPP operand */ Operand dpp_op = instr->operands[0]; instr->operands[0] = Operand(PhysReg{250}, v1); - instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP); + instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP16); emit_instruction(ctx, out, instr); uint32_t encoding = (0xF & dpp.row_mask) << 28; encoding |= (0xF & dpp.bank_mask) << 24; @@ -684,6 +684,20 @@ emit_instruction(asm_context& ctx, std::vector& out, Instruction* inst encoding |= (0xFF) & dpp_op.physReg(); out.push_back(encoding); return; + } else if (instr->isDPP8()) { + assert(ctx.chip_class >= GFX10); + DPP8_instruction& dpp = instr->dpp8(); + + /* first emit the instruction without the DPP operand */ + Operand dpp_op = instr->operands[0]; + instr->operands[0] = Operand(PhysReg{234}, v1); + instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP8); + emit_instruction(ctx, out, instr); + uint32_t encoding = (0xFF) & dpp_op.physReg(); + for (unsigned i = 0; i < 8; ++i) + encoding |= dpp.lane_sel[i] << (8 + i * 3); + out.push_back(encoding); + return; } else if (instr->isSDWA()) { SDWA_instruction& sdwa = instr->sdwa(); diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index 9ebf633eef0..a5a9eee22f2 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -536,9 +536,12 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod ("vop3", [Format.VOP3], 'VOP3_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]), ("vop3p", [Format.VOP3P], 'VOP3P_instruction', [(1, 2), (1, 3)]), ("vintrp", [Format.VINTRP], 'Interp_instruction', [(1, 2), (1, 3)]), - ("vop1_dpp", [Format.VOP1, Format.DPP], 'DPP_instruction', [(1, 1)]), - ("vop2_dpp", [Format.VOP2, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2, 3])), - ("vopc_dpp", [Format.VOPC, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2])), + ("vop1_dpp", [Format.VOP1, Format.DPP16], 'DPP16_instruction', [(1, 1)]), + ("vop2_dpp", [Format.VOP2, Format.DPP16], 'DPP16_instruction', itertools.product([1, 2], [2, 3])), + ("vopc_dpp", [Format.VOPC, Format.DPP16], 'DPP16_instruction', itertools.product([1, 2], [2])), + ("vop1_dpp8", [Format.VOP1, Format.DPP8], 'DPP8_instruction', [(1, 1)]), + ("vop2_dpp8", [Format.VOP2, Format.DPP8], 'DPP8_instruction', itertools.product([1, 2], [2, 3])), + ("vopc_dpp8", [Format.VOPC, Format.DPP8], 'DPP8_instruction', itertools.product([1, 2], [2])), ("vop1_e64", [Format.VOP1, Format.VOP3], 'VOP3_instruction', itertools.product([1], [1])), ("vop2_e64", [Format.VOP2, Format.VOP3], 'VOP3_instruction', itertools.product([1, 2], [2, 3])), ("vopc_e64", [Format.VOPC, Format.VOP3], 'VOP3_instruction', itertools.product([1, 2], [2])), diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index eee2eb2e882..523b25e619e 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -292,12 +292,12 @@ convert_to_SDWA(chip_class chip, aco_ptr& instr) } bool -can_use_DPP(const aco_ptr& instr, bool pre_ra) +can_use_DPP(const aco_ptr& instr, bool pre_ra, bool dpp8) { assert(instr->isVALU() && !instr->operands.empty()); if (instr->isDPP()) - return true; + return instr->isDPP8() == dpp8; if (instr->operands.size() && instr->operands[0].isLiteral()) return false; @@ -316,6 +316,8 @@ can_use_DPP(const aco_ptr& instr, bool pre_ra) const VOP3_instruction* vop3 = &instr->vop3(); if (vop3->clamp || vop3->omod || vop3->opsel) return false; + if (dpp8) + return false; if (instr->format == Format::VOP3) return false; if (instr->operands.size() > 1 && !instr->operands[1].isOfType(RegType::vgpr)) @@ -331,29 +333,39 @@ can_use_DPP(const aco_ptr& instr, bool pre_ra) } aco_ptr -convert_to_DPP(aco_ptr& instr) +convert_to_DPP(aco_ptr& instr, bool dpp8) { if (instr->isDPP()) return NULL; aco_ptr tmp = std::move(instr); - Format format = - (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) | (uint32_t)Format::DPP); - instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), - tmp->definitions.size())); + Format format = (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) | + (dpp8 ? (uint32_t)Format::DPP8 : (uint32_t)Format::DPP16)); + if (dpp8) + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), + tmp->definitions.size())); + else + instr.reset(create_instruction(tmp->opcode, format, tmp->operands.size(), + tmp->definitions.size())); std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); for (unsigned i = 0; i < instr->definitions.size(); i++) instr->definitions[i] = tmp->definitions[i]; - DPP_instruction* dpp = &instr->dpp(); - dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3); - dpp->row_mask = 0xf; - dpp->bank_mask = 0xf; + if (dpp8) { + DPP8_instruction* dpp = &instr->dpp8(); + for (unsigned i = 0; i < 8; i++) + dpp->lane_sel[i] = i; + } else { + DPP16_instruction* dpp = &instr->dpp16(); + dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3); + dpp->row_mask = 0xf; + dpp->bank_mask = 0xf; - if (tmp->isVOP3()) { - const VOP3_instruction* vop3 = &tmp->vop3(); - memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg)); - memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs)); + if (tmp->isVOP3()) { + const VOP3_instruction* vop3 = &tmp->vop3(); + memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg)); + memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs)); + } } if (instr->isVOPC() || instr->definitions.size() > 1) diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index e8c3e56c2dd..39b2e3aa70f 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -102,8 +102,9 @@ enum class Format : std::uint16_t { VOP3 = 1 << 11, /* Vector Parameter Interpolation Format */ VINTRP = 1 << 12, - DPP = 1 << 13, + DPP16 = 1 << 13, SDWA = 1 << 14, + DPP8 = 1 << 15, }; enum class instr_class : uint8_t { @@ -294,7 +295,7 @@ asSDWA(Format format) constexpr Format withoutDPP(Format format) { - return (Format)((uint32_t)format & ~(uint32_t)Format::DPP); + return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8)); } enum class RegType { @@ -996,7 +997,8 @@ struct VOP2_instruction; struct VOPC_instruction; struct VOP3_instruction; struct Interp_instruction; -struct DPP_instruction; +struct DPP16_instruction; +struct DPP8_instruction; struct SDWA_instruction; struct Instruction { @@ -1282,17 +1284,29 @@ struct Instruction { return *(Interp_instruction*)this; } constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; } - DPP_instruction& dpp() noexcept + DPP16_instruction& dpp16() noexcept { - assert(isDPP()); - return *(DPP_instruction*)this; + assert(isDPP16()); + return *(DPP16_instruction*)this; } - const DPP_instruction& dpp() const noexcept + const DPP16_instruction& dpp16() const noexcept { - assert(isDPP()); - return *(DPP_instruction*)this; + assert(isDPP16()); + return *(DPP16_instruction*)this; } - constexpr bool isDPP() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP; } + constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; } + DPP8_instruction& dpp8() noexcept + { + assert(isDPP8()); + return *(DPP8_instruction*)this; + } + const DPP8_instruction& dpp8() const noexcept + { + assert(isDPP8()); + return *(DPP8_instruction*)this; + } + constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; } + constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); } SDWA_instruction& sdwa() noexcept { assert(isSDWA()); @@ -1405,7 +1419,7 @@ static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected * The swizzle applies to the src0 operand. * */ -struct DPP_instruction : public Instruction { +struct DPP16_instruction : public Instruction { bool abs[2]; bool neg[2]; uint16_t dpp_ctrl; @@ -1414,7 +1428,12 @@ struct DPP_instruction : public Instruction { bool bound_ctrl : 1; uint8_t padding : 7; }; -static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); +static_assert(sizeof(DPP16_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); + +struct DPP8_instruction : public Instruction { + uint8_t lane_sel[8]; +}; +static_assert(sizeof(DPP8_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); struct SubdwordSel { enum sdwa_sel : uint8_t { @@ -1760,10 +1779,10 @@ bool is_dead(const std::vector& uses, Instruction* instr); bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high); bool instr_is_16bit(chip_class chip, aco_opcode op); bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra); -bool can_use_DPP(const aco_ptr& instr, bool pre_ra); +bool can_use_DPP(const aco_ptr& instr, bool pre_ra, bool dpp8); /* updates "instr" and returns the old instruction (or NULL if no update was needed) */ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& instr); -aco_ptr convert_to_DPP(aco_ptr& instr); +aco_ptr convert_to_DPP(aco_ptr& instr, bool dpp8); bool needs_exec_mask(const Instruction* instr); aco_opcode get_ordered(aco_opcode op); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index bb027180456..16494a701c4 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -74,8 +74,9 @@ class Format(Enum): VOPC = 1 << 10 VOP3 = 1 << 11 VINTRP = 1 << 12 - DPP = 1 << 13 + DPP16 = 1 << 13 SDWA = 1 << 14 + DPP8 = 1 << 15 def get_builder_fields(self): if self == Format.SOPK: @@ -147,7 +148,7 @@ class Format(Enum): elif self == Format.VINTRP: return [('unsigned', 'attribute', None), ('unsigned', 'component', None)] - elif self == Format.DPP: + elif self == Format.DPP16: return [('uint16_t', 'dpp_ctrl', None), ('uint8_t', 'row_mask', '0xF'), ('uint8_t', 'bank_mask', '0xF'), diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index d650665e0d0..dba86b87538 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -87,8 +87,11 @@ struct InstrHash { if (instr->isVOP3()) return hash_murmur_32(instr); - if (instr->isDPP()) - return hash_murmur_32(instr); + if (instr->isDPP16()) + return hash_murmur_32(instr); + + if (instr->isDPP8()) + return hash_murmur_32(instr); if (instr->isSDWA()) return hash_murmur_32(instr); @@ -172,15 +175,21 @@ struct InstrPred { } return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel; } - if (a->isDPP()) { - DPP_instruction& aDPP = a->dpp(); - DPP_instruction& bDPP = b->dpp(); + if (a->isDPP16()) { + DPP16_instruction& aDPP = a->dpp16(); + DPP16_instruction& bDPP = b->dpp16(); return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl && aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask && aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] && aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] && aDPP.neg[1] == bDPP.neg[1]; } + if (a->isDPP8()) { + DPP8_instruction& aDPP = a->dpp8(); + DPP8_instruction& bDPP = b->dpp8(); + return aDPP.pass_flags == bDPP.pass_flags && + !memcmp(aDPP.lane_sel, bDPP.lane_sel, sizeof(aDPP.lane_sel)); + } if (a->isSDWA()) { SDWA_instruction& aSDWA = a->sdwa(); SDWA_instruction& bSDWA = b->sdwa(); diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 2c346b976ba..031b88745ae 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -120,12 +120,14 @@ enum Label { label_canonicalized = 1ull << 32, label_extract = 1ull << 33, label_insert = 1ull << 34, - label_dpp = 1ull << 35, + label_dpp16 = 1ull << 35, + label_dpp8 = 1ull << 36, }; static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise | - label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp; + label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 | + label_dpp8; static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert; @@ -455,13 +457,21 @@ struct ssa_info { bool is_insert() { return label & label_insert; } - void set_dpp(Instruction* mov) + void set_dpp16(Instruction* mov) { - add_label(label_dpp); + add_label(label_dpp16); instr = mov; } - bool is_dpp() { return label & label_dpp; } + void set_dpp8(Instruction* mov) + { + add_label(label_dpp8); + instr = mov; + } + + bool is_dpp() { return label & (label_dpp16 | label_dpp8); } + bool is_dpp16() { return label & label_dpp16; } + bool is_dpp8() { return label & label_dpp8; } }; struct opt_ctx { @@ -1215,7 +1225,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) if (instr->isSDWA()) can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4; else - can_use_mod = can_use_mod && (instr->isDPP() || can_use_VOP3(ctx, instr)); + can_use_mod = can_use_mod && (instr->isDPP16() || can_use_VOP3(ctx, instr)); if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) { instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32; @@ -1228,8 +1238,8 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) if (!instr->isDPP() && !instr->isSDWA()) to_VOP3(ctx, instr); instr->operands[i].setTemp(info.temp); - if (instr->isDPP() && !instr->dpp().abs[i]) - instr->dpp().neg[i] = true; + if (instr->isDPP16() && !instr->dpp16().abs[i]) + instr->dpp16().neg[i] = true; else if (instr->isSDWA() && !instr->sdwa().abs[i]) instr->sdwa().neg[i] = true; else if (instr->isVOP3() && !instr->vop3().abs[i]) @@ -1239,8 +1249,8 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) if (!instr->isDPP() && !instr->isSDWA()) to_VOP3(ctx, instr); instr->operands[i] = Operand(info.temp); - if (instr->isDPP()) - instr->dpp().abs[i] = true; + if (instr->isDPP16()) + instr->dpp16().abs[i] = true; else if (instr->isSDWA()) instr->sdwa().abs[i] = true; else @@ -1579,10 +1589,12 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) } break; case aco_opcode::v_mov_b32: - if (instr->isDPP()) { + if (instr->isDPP16()) { /* anything else doesn't make sense in SSA */ - assert(instr->dpp().row_mask == 0xf && instr->dpp().bank_mask == 0xf); - ctx.info[instr->definitions[0].tempId()].set_dpp(instr.get()); + assert(instr->dpp16().row_mask == 0xf && instr->dpp16().bank_mask == 0xf); + ctx.info[instr->definitions[0].tempId()].set_dpp16(instr.get()); + } else if (instr->isDPP8()) { + ctx.info[instr->definitions[0].tempId()].set_dpp8(instr.get()); } break; case aco_opcode::p_is_helper: @@ -2250,10 +2262,10 @@ combine_inverse_comparison(opt_ctx& ctx, aco_ptr& instr) new_sdwa->clamp = cmp_sdwa.clamp; new_sdwa->omod = cmp_sdwa.omod; new_instr = new_sdwa; - } else if (cmp->isDPP()) { - DPP_instruction* new_dpp = create_instruction( - new_opcode, (Format)((uint16_t)Format::DPP | (uint16_t)Format::VOPC), 2, 1); - DPP_instruction& cmp_dpp = cmp->dpp(); + } else if (cmp->isDPP16()) { + DPP16_instruction* new_dpp = create_instruction( + new_opcode, (Format)((uint16_t)Format::DPP16 | (uint16_t)Format::VOPC), 2, 1); + DPP16_instruction& cmp_dpp = cmp->dpp16(); memcpy(new_dpp->abs, cmp_dpp.abs, sizeof(new_dpp->abs)); memcpy(new_dpp->neg, cmp_dpp.neg, sizeof(new_dpp->neg)); new_dpp->dpp_ctrl = cmp_dpp.dpp_ctrl; @@ -2261,6 +2273,12 @@ combine_inverse_comparison(opt_ctx& ctx, aco_ptr& instr) new_dpp->bank_mask = cmp_dpp.bank_mask; new_dpp->bound_ctrl = cmp_dpp.bound_ctrl; new_instr = new_dpp; + } else if (cmp->isDPP8()) { + DPP8_instruction* new_dpp = create_instruction( + new_opcode, (Format)((uint16_t)Format::DPP8 | (uint16_t)Format::VOPC), 2, 1); + DPP8_instruction& cmp_dpp = cmp->dpp8(); + memcpy(new_dpp->lane_sel, cmp_dpp.lane_sel, sizeof(new_dpp->lane_sel)); + new_instr = new_dpp; } else { new_instr = create_instruction(new_opcode, Format::VOPC, 2, 1); instr->definitions[0].setHint(vcc); @@ -4005,23 +4023,34 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) aco_opcode swapped_op; if (info.is_dpp() && info.instr->pass_flags == instr->pass_flags && - (i == 0 || can_swap_operands(instr, &swapped_op)) && can_use_DPP(instr, true) && - !instr->isDPP()) { - convert_to_DPP(instr); - DPP_instruction* dpp = static_cast(instr.get()); - if (i) { - instr->opcode = swapped_op; - std::swap(instr->operands[0], instr->operands[1]); - std::swap(dpp->neg[0], dpp->neg[1]); - std::swap(dpp->abs[0], dpp->abs[1]); + (i == 0 || can_swap_operands(instr, &swapped_op)) && + can_use_DPP(instr, true, info.is_dpp8()) && !instr->isDPP()) { + bool dpp8 = info.is_dpp8(); + convert_to_DPP(instr, dpp8); + if (dpp8) { + DPP8_instruction* dpp = &instr->dpp8(); + for (unsigned j = 0; j < 8; ++j) + dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j]; + if (i) { + instr->opcode = swapped_op; + std::swap(instr->operands[0], instr->operands[1]); + } + } else { + DPP16_instruction* dpp = &instr->dpp16(); + if (i) { + instr->opcode = swapped_op; + std::swap(instr->operands[0], instr->operands[1]); + std::swap(dpp->neg[0], dpp->neg[1]); + std::swap(dpp->abs[0], dpp->abs[1]); + } + dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl; + dpp->bound_ctrl = info.instr->dpp16().bound_ctrl; + dpp->neg[0] ^= info.instr->dpp16().neg[0] && !dpp->abs[0]; + dpp->abs[0] |= info.instr->dpp16().abs[0]; } if (--ctx.uses[info.instr->definitions[0].tempId()]) ctx.uses[info.instr->operands[0].tempId()]++; instr->operands[0].setTemp(info.instr->operands[0].getTemp()); - dpp->dpp_ctrl = info.instr->dpp().dpp_ctrl; - dpp->bound_ctrl = info.instr->dpp().bound_ctrl; - dpp->neg[0] ^= info.instr->dpp().neg[0] && !dpp->abs[0]; - dpp->abs[0] |= info.instr->dpp().abs[0]; break; } } diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index 5019eb619f0..1953c00d4a6 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -386,7 +386,7 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr& instr) * */ - if (!instr->isVALU() || instr->isDPP() || !can_use_DPP(instr, false)) + if (!instr->isVALU() || instr->isDPP()) return; for (unsigned i = 0; i < MIN2(2, instr->operands.size()); i++) { @@ -394,9 +394,12 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr& instr) if (!op_instr_idx.found()) continue; - Instruction* mov = ctx.get(op_instr_idx); + const Instruction* mov = ctx.get(op_instr_idx); if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP()) continue; + bool dpp8 = mov->isDPP8(); + if (!can_use_DPP(instr, false, dpp8)) + return; /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite * it's own operand before we use it. @@ -412,25 +415,34 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr& instr) if (i && !can_swap_operands(instr, &instr->opcode)) continue; - /* anything else doesn't make sense in SSA */ - assert(mov->dpp().row_mask == 0xf && mov->dpp().bank_mask == 0xf); + if (!dpp8) /* anything else doesn't make sense in SSA */ + assert(mov->dpp16().row_mask == 0xf && mov->dpp16().bank_mask == 0xf); if (--ctx.uses[mov->definitions[0].tempId()]) ctx.uses[mov->operands[0].tempId()]++; - convert_to_DPP(instr); + convert_to_DPP(instr, dpp8); - DPP_instruction* dpp = &instr->dpp(); - if (i) { - std::swap(dpp->operands[0], dpp->operands[1]); - std::swap(dpp->neg[0], dpp->neg[1]); - std::swap(dpp->abs[0], dpp->abs[1]); + if (dpp8) { + DPP8_instruction* dpp = &instr->dpp8(); + if (i) { + std::swap(dpp->operands[0], dpp->operands[1]); + } + dpp->operands[0] = mov->operands[0]; + memcpy(dpp->lane_sel, mov->dpp8().lane_sel, sizeof(dpp->lane_sel)); + } else { + DPP16_instruction* dpp = &instr->dpp16(); + if (i) { + std::swap(dpp->operands[0], dpp->operands[1]); + std::swap(dpp->neg[0], dpp->neg[1]); + std::swap(dpp->abs[0], dpp->abs[1]); + } + dpp->operands[0] = mov->operands[0]; + dpp->dpp_ctrl = mov->dpp16().dpp_ctrl; + dpp->bound_ctrl = true; + dpp->neg[0] ^= mov->dpp16().neg[0] && !dpp->abs[0]; + dpp->abs[0] |= mov->dpp16().abs[0]; } - dpp->operands[0] = mov->operands[0]; - dpp->dpp_ctrl = mov->dpp().dpp_ctrl; - dpp->bound_ctrl = true; - dpp->neg[0] ^= mov->dpp().neg[0] && !dpp->abs[0]; - dpp->abs[0] |= mov->dpp().abs[0]; return; } } diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 750f54a4c22..d3b24dc0a57 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -566,8 +566,8 @@ print_instr_format_specific(const Instruction* instr, FILE* output) fprintf(output, " clamp"); if (vop3.opsel & (1 << 3)) fprintf(output, " opsel_hi"); - } else if (instr->isDPP()) { - const DPP_instruction& dpp = instr->dpp(); + } else if (instr->isDPP16()) { + const DPP16_instruction& dpp = instr->dpp16(); if (dpp.dpp_ctrl <= 0xff) { fprintf(output, " quad_perm:[%d,%d,%d,%d]", dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3, (dpp.dpp_ctrl >> 4) & 0x3, (dpp.dpp_ctrl >> 6) & 0x3); @@ -602,6 +602,11 @@ print_instr_format_specific(const Instruction* instr, FILE* output) fprintf(output, " bank_mask:0x%.1x", dpp.bank_mask); if (dpp.bound_ctrl) fprintf(output, " bound_ctrl:1"); + } else if (instr->isDPP8()) { + const DPP8_instruction& dpp = instr->dpp8(); + fprintf(output, " dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]", dpp.lane_sel[0], dpp.lane_sel[1], + dpp.lane_sel[2], dpp.lane_sel[3], dpp.lane_sel[4], dpp.lane_sel[5], dpp.lane_sel[6], + dpp.lane_sel[7]); } else if (instr->isSDWA()) { const SDWA_instruction& sdwa = instr->sdwa(); switch (sdwa.omod) { @@ -668,8 +673,8 @@ aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) neg[i] = vop3.neg[i]; opsel[i] = vop3.opsel & (1 << i); } - } else if (instr->isDPP()) { - const DPP_instruction& dpp = instr->dpp(); + } else if (instr->isDPP16()) { + const DPP16_instruction& dpp = instr->dpp16(); for (unsigned i = 0; i < 2; ++i) { abs[i] = dpp.abs[i]; neg[i] = dpp.neg[i]; diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 445b4cd4918..7469fd8a964 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -116,7 +116,8 @@ validate_ir(Program* program) /* check base format */ Format base_format = instr->format; base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::SDWA); - base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP); + base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP16); + base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP8); if ((uint32_t)base_format & (uint32_t)Format::VOP1) base_format = Format::VOP1; else if ((uint32_t)base_format & (uint32_t)Format::VOP2) diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index f33de4caf76..50fa46b1fb1 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -988,7 +988,7 @@ BEGIN_TEST(optimizer.dpp) //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 //! p_unit_test 3, %res3 auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); - tmp3.instr->dpp().neg[0] = true; + tmp3.instr->dpp16().neg[0] = true; Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b); writeout(3, res3); @@ -1010,7 +1010,7 @@ BEGIN_TEST(optimizer.dpp) //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 //! p_unit_test 6, %res6 auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); - tmp6.instr->dpp().neg[0] = true; + tmp6.instr->dpp16().neg[0] = true; auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b); res6.instr->vop3().abs[0] = true; writeout(6, res6); diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp index f0345296fc4..468a24cdb51 100644 --- a/src/amd/compiler/tests/test_optimizer_postRA.cpp +++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp @@ -319,7 +319,7 @@ BEGIN_TEST(optimizer_postRA.dpp) //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 //! p_unit_test 3, %res3:v[2] auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); - tmp3.instr->dpp().neg[0] = true; + tmp3.instr->dpp16().neg[0] = true; Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b); writeout(3, Operand(res3, reg_v2)); @@ -341,7 +341,7 @@ BEGIN_TEST(optimizer_postRA.dpp) //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 //! p_unit_test 6, %res6:v[2] auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); - tmp6.instr->dpp().neg[0] = true; + tmp6.instr->dpp16().neg[0] = true; auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp6, reg_v2), b); res6.instr->vop3().abs[0] = true; writeout(6, Operand(res6, reg_v2));