From 17a9ee7152822f23ffc3022b5a3b0ecc117a8011 Mon Sep 17 00:00:00 2001
From: Georg Lehmann <dadschoorse@gmail.com>
Date: Thu, 19 Mar 2026 13:59:21 +0100
Subject: [PATCH] aco/optimizer: apply dpp to v_dot before RA for gfx10.3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a bit unusual, as we otherwise only use the VOP2 codesize
optimization opcodes in the register allocator.

But unless we change the scheduler to not split v_mov_b32_dpp and
v_dot, we have no other choice.

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40510>
---
 src/amd/compiler/aco_optimizer.cpp        | 39 ++++++++++++++++++++---
 src/amd/compiler/tests/test_optimizer.cpp | 25 +++++++++++++++
 2 files changed, 59 insertions(+), 5 deletions(-)
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index aa1f0b587ce..65e32fae938 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -997,9 +997,32 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info)
    if (is_dpp_or_sdwa && !format_is(info.format, Format::VOPC) && info.defs[0].size() != 1)
       return false;
 
-   if (is_dpp && !opcode_supports_dpp(ctx.program->gfx_level, info.opcode,
-                                      format_is(info.format, Format::VOP3P)))
-      return false;
+   if (is_dpp) {
+      if ((info.opcode == aco_opcode::v_dot2_f32_f16 || info.opcode == aco_opcode::v_dot4_i32_i8) &&
+          ctx.program->gfx_level >= GFX10 && ctx.program->gfx_level <= GFX10_3) {
+         /* DPP only supports v_dotc for GFX10(.3), but it's really important it gets applied.
+          * So already do the transformation before RA.
+          */
+         if (neg || abs || vmask != 0x7 || opsel || !info.operands[0].extract[1].offset() ||
+             !info.operands[1].extract[1].offset())
+            return false;
+
+         if (info.opcode == aco_opcode::v_dot2_f32_f16)
+            info.opcode = aco_opcode::v_dot2c_f32_f16;
+         else
+            info.opcode = aco_opcode::v_dot4c_i32_i8;
+
+         if (info.operands[0].dpp16)
+            info.format = format_combine(Format::VOP2, Format::DPP16);
+         else if (info.operands[0].dpp8)
+            info.format = format_combine(Format::VOP2, Format::DPP8);
+
+         return true;
+      } else if (!opcode_supports_dpp(ctx.program->gfx_level, info.opcode,
+                                      format_is(info.format, Format::VOP3P))) {
+         return false;
+      }
+   }
 
    if (format_is(info.format, Format::VOP1) || format_is(info.format, Format::VOP2) ||
        format_is(info.format, Format::VOPC) || format_is(info.format, Format::VOP3)) {
@@ -1222,11 +1245,11 @@ alu_opt_gather_info(opt_ctx& ctx, Instruction* instr, alu_opt_info& info)
       return false;
 
    switch (instr->opcode) {
+   case aco_opcode::v_dot2c_f32_f16:
+   case aco_opcode::v_dot4c_i32_i8: assert(instr->isDPP()); return false;
    case aco_opcode::s_addk_i32:
    case aco_opcode::s_cmovk_i32:
    case aco_opcode::s_mulk_i32:
-   case aco_opcode::v_dot2c_f32_f16:
-   case aco_opcode::v_dot4c_i32_i8:
    case aco_opcode::v_fmac_f32:
    case aco_opcode::v_fmac_f16:
    case aco_opcode::v_fmac_legacy_f32:
@@ -5076,6 +5099,12 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
          if (!alu_opt_info_is_valid(ctx, candidate))
             continue;
 
+         /* Don't use dotc if it might need to mov the accumulator. */
+         if ((candidate.opcode == aco_opcode::v_dot2c_f32_f16 ||
+              candidate.opcode == aco_opcode::v_dot4c_i32_i8) &&
+             ctx.uses[candidate.operands[2].op.tempId()] > 1)
+            continue;
+
          if (--ctx.uses[parent->definitions[0].tempId()])
             ctx.uses[parent->operands[0].tempId()]++;
          input_info.operands[i] = inner;
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 2906547f708..29944c36f02 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -2376,3 +2376,28 @@ BEGIN_TEST(optimizer.pk_mul_pk_cvt)
       finish_opt_test();
    }
 END_TEST
+
+BEGIN_TEST(optimizer.dotc_dpp)
+   //>>  v1: %a:v[0],  v1: %b:v[1],  v1: %c:v[2],  v1: %d:v[3] = p_startpgm
+   if (!setup_cs("v1 v1 v1 v1", GFX10_3))
+      return;
+
+   Temp a = inputs[0];
+   Temp b = inputs[1];
+   Temp c = inputs[2];
+   Temp d = inputs[3];
+   /* DPP8 mov + v_dot2_f32_f16: expect the mov folded into one v_dot2c_f32_f16 with dpp8. */
+   //! v1: %dot2 = v_dot2c_f32_f16 %a, %b, %c dpp8:[0,0,0,0,0,0,0,0] fi
+   //! p_unit_test 0, %dot2
+   Temp dpp = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), a, 0);
+   Temp dot2 = bld.vop3p(aco_opcode::v_dot2_f32_f16, bld.def(v1), dpp, b, c, 0x0, 0x7);
+   writeout(0, dot2);
+   /* row_mirror DPP mov + v_dot4_i32_i8: expect the mov folded into one v_dot4c_i32_i8. */
+   //!  v1: %dot4 = v_dot4c_i32_i8 %a, %b, %d row_mirror bound_ctrl:1 fi
+   //! p_unit_test 1, %dot4
+   dpp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   Temp dot4 = bld.vop3p(aco_opcode::v_dot4_i32_i8, bld.def(v1), dpp, b, d, 0x0, 0x7);
+   writeout(1, dot4);
+
+   finish_opt_test();
+END_TEST