aco: shrink DPP8_instruction

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25525>
2025-12-21 15:50:11 +01:00 · 2023-10-02 15:44:49 +01:00 · 2023-10-02 15:44:49 +01:00 · 26fce534b5
commit 26fce534b5
parent e90b5385a0
10 changed files with 25 additions and 29 deletions
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@ -814,8 +814,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
         emit_instruction(ctx, out, instr);
         uint32_t encoding = reg(ctx, dpp_op, 8);
         encoding |= dpp.opsel[0] && !instr->isVOP3() ? 128 : 0;
-         for (unsigned i = 0; i < 8; ++i)
-            encoding |= dpp.lane_sel[i] << (8 + i * 3);
+         encoding |= dpp.lane_sel << 8;
         out.push_back(encoding);
         return;
      } else if (instr->isVOP3()) {
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@ -275,11 +275,10 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
      } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
         dpp_ctrl = dpp_row_xmask(xor_mask);
      } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
-         Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src);
-         for (unsigned i = 0; i < 8; i++) {
-            ret->dpp8().lane_sel[i] = ((i & and_mask) ^ xor_mask);
-         }
-         return ret;
+         uint32_t lane_sel = 0;
+         for (unsigned i = 0; i < 8; i++)
+            lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
+         return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel);
      } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
         uint64_t lane_mask = 0;
         for (unsigned i = 0; i < 16; i++)
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@ -455,8 +455,7 @@ convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)

   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
-      for (unsigned i = 0; i < 8; i++)
-         dpp->lane_sel[i] = i;
+      dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@ -1461,9 +1461,10 @@ struct DPP16_instruction : public VALU_instruction {
 static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");

 struct DPP8_instruction : public VALU_instruction {
-   uint8_t lane_sel[8];
+   uint32_t lane_sel : 24;
+   uint32_t padding : 8;
 };
-static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 8, "Unexpected padding");
+static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");

 struct SubdwordSel {
   enum sdwa_sel : uint8_t {
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@ -2831,12 +2831,12 @@ lower_to_hw_instr(Program* program)
                  Operand src0 = instr->operands[i];
                  Operand src1 = instr->operands[i + 4];

+                  uint32_t lane_sel_xor1 = 0;
+                  for (unsigned j = 0; j < 8; j++)
+                     lane_sel_xor1 |= (j ^ 1) << (j * 3);
+
                  /* Swap odd, even lanes of mrt0. */
-                  Builder::Result ret =
-                     bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0);
-                  for (unsigned j = 0; j < 8; j++) {
-                     ret->dpp8().lane_sel[j] = j ^ 1;
-                  }
+                  bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0, lane_sel_xor1);

                  /* Swap even lanes between mrt0 and mrt1. */
                  bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1,
@ -2845,11 +2845,8 @@ lower_to_hw_instr(Program* program)
                           Operand(clobber_vcc.physReg(), bld.lm));

                  /* Swap odd, even lanes of mrt0 again. */
-                  ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1),
-                                      Operand(tmp.physReg(), v1));
-                  for (unsigned j = 0; j < 8; j++) {
-                     ret->dpp8().lane_sel[j] = j ^ 1;
-                  }
+                  bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1),
+                                Operand(tmp.physReg(), v1), lane_sel_xor1);

                  mrt0[i] = Operand(dst0, v1);
                  mrt1[i] = Operand(dst1, v1);
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@ -161,6 +161,8 @@ class Format(Enum):
                 ('uint8_t', 'row_mask', '0xF'),
                 ('uint8_t', 'bank_mask', '0xF'),
                 ('bool', 'bound_ctrl', 'true')]
+      elif self == Format.DPP8:
+         return [('uint32_t', 'lane_sel', 0)]
      elif self == Format.VOP3P:
         return [('uint8_t', 'opsel_lo', None),
                 ('uint8_t', 'opsel_hi', None)]
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@ -186,8 +186,7 @@ struct InstrPred {
      if (a->isDPP8()) {
         DPP8_instruction& aDPP = a->dpp8();
         DPP8_instruction& bDPP = b->dpp8();
-         return aDPP.pass_flags == bDPP.pass_flags &&
-                !memcmp(aDPP.lane_sel, bDPP.lane_sel, sizeof(aDPP.lane_sel));
+         return aDPP.pass_flags == bDPP.pass_flags && aDPP.lane_sel == bDPP.lane_sel;
      }
      if (a->isSDWA()) {
         SDWA_instruction& aSDWA = a->sdwa();
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@ -4865,8 +4865,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)

         if (dpp8) {
            DPP8_instruction* dpp = &instr->dpp8();
-            for (unsigned j = 0; j < 8; ++j)
-               dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j];
+            dpp->lane_sel = info.instr->dpp8().lane_sel;
            if (mov_uses_mods)
               instr->format = asVOP3(instr->format);
         } else {
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@ -547,7 +547,7 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)

      if (dpp8) {
         DPP8_instruction* dpp = &instr->dpp8();
-         memcpy(dpp->lane_sel, mov->dpp8().lane_sel, sizeof(dpp->lane_sel));
+         dpp->lane_sel = mov->dpp8().lane_sel;
         if (mov_uses_mods)
            instr->format = asVOP3(instr->format);
      } else {
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@ -709,9 +709,10 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
         fprintf(output, " bound_ctrl:1");
   } else if (instr->isDPP8()) {
      const DPP8_instruction& dpp = instr->dpp8();
-      fprintf(output, " dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]", dpp.lane_sel[0], dpp.lane_sel[1],
-              dpp.lane_sel[2], dpp.lane_sel[3], dpp.lane_sel[4], dpp.lane_sel[5], dpp.lane_sel[6],
-              dpp.lane_sel[7]);
+      fprintf(output, " dpp8:[");
+      for (unsigned i = 0; i < 8; i++) /* lane_sel packs eight 3-bit selectors */
+         fprintf(output, "%s%u", i ? "," : "", (dpp.lane_sel >> (i * 3)) & 0x7);
+      fprintf(output, "]");
   } else if (instr->isSDWA()) {
      const SDWA_instruction& sdwa = instr->sdwa();
      if (!instr->isVOPC()) {