From 17a9ee7152822f23ffc3022b5a3b0ecc117a8011 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Thu, 19 Mar 2026 13:59:21 +0100 Subject: [PATCH] aco/optimizer: apply dpp to v_dot before RA for gfx10.3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a bit unusual, as we otherwise only use the VOP2 codesize optimization opcodes in the register allocator. But unless we change the scheduler to not split v_mov_b32_dpp and v_dot, we have no other choice. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 39 ++++++++++++++++++++--- src/amd/compiler/tests/test_optimizer.cpp | 25 +++++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index aa1f0b587ce..65e32fae938 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -997,9 +997,32 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info) if (is_dpp_or_sdwa && !format_is(info.format, Format::VOPC) && info.defs[0].size() != 1) return false; - if (is_dpp && !opcode_supports_dpp(ctx.program->gfx_level, info.opcode, - format_is(info.format, Format::VOP3P))) - return false; + if (is_dpp) { + if ((info.opcode == aco_opcode::v_dot2_f32_f16 || info.opcode == aco_opcode::v_dot4_i32_i8) && + ctx.program->gfx_level >= GFX10 && ctx.program->gfx_level <= GFX10_3) { + /* DPP only supports v_dotc for GFX10(.3), but it's really important it gets applied. + * So already do the transformation before RA. + */ + if (neg || abs || vmask != 0x7 || opsel || !info.operands[0].extract[1].offset() || + !info.operands[1].extract[1].offset()) + return false; + + if (info.opcode == aco_opcode::v_dot2_f32_f16) + info.opcode = aco_opcode::v_dot2c_f32_f16; + else + info.opcode = aco_opcode::v_dot4c_i32_i8; + + if (info.operands[0].dpp16) + info.format = format_combine(Format::VOP2, Format::DPP16); + else if (info.operands[0].dpp8) + info.format = format_combine(Format::VOP2, Format::DPP8); + + return true; + } else if (!opcode_supports_dpp(ctx.program->gfx_level, info.opcode, + format_is(info.format, Format::VOP3P))) { + return false; + } + } if (format_is(info.format, Format::VOP1) || format_is(info.format, Format::VOP2) || format_is(info.format, Format::VOPC) || format_is(info.format, Format::VOP3)) { @@ -1222,11 +1245,11 @@ alu_opt_gather_info(opt_ctx& ctx, Instruction* instr, alu_opt_info& info) return false; switch (instr->opcode) { + case aco_opcode::v_dot2c_f32_f16: + case aco_opcode::v_dot4c_i32_i8: assert(instr->isDPP()); return false; case aco_opcode::s_addk_i32: case aco_opcode::s_cmovk_i32: case aco_opcode::s_mulk_i32: - case aco_opcode::v_dot2c_f32_f16: - case aco_opcode::v_dot4c_i32_i8: case aco_opcode::v_fmac_f32: case aco_opcode::v_fmac_f16: case aco_opcode::v_fmac_legacy_f32: @@ -5076,6 +5099,12 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) if (!alu_opt_info_is_valid(ctx, candidate)) continue; + /* Don't use dotc if it might need to mov the accumulator. */ + if ((candidate.opcode == aco_opcode::v_dot2c_f32_f16 || + candidate.opcode == aco_opcode::v_dot4c_i32_i8) && + ctx.uses[candidate.operands[2].op.tempId()] > 1) + continue; + if (--ctx.uses[parent->definitions[0].tempId()]) ctx.uses[parent->operands[0].tempId()]++; input_info.operands[i] = inner; diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 2906547f708..29944c36f02 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -2376,3 +2376,28 @@ BEGIN_TEST(optimizer.pk_mul_pk_cvt) finish_opt_test(); } END_TEST + +BEGIN_TEST(optimizer.dotc_dpp) + //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3] = p_startpgm + if (!setup_cs("v1 v1 v1 v1", GFX10_3)) + return; + + Temp a = inputs[0]; + Temp b = inputs[1]; + Temp c = inputs[2]; + Temp d = inputs[3]; + + //! v1: %dot2 = v_dot2c_f32_f16 %a, %b, %c dpp8:[0,0,0,0,0,0,0,0] fi + //! p_unit_test 0, %dot2 + Temp dpp = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), a, 0); + Temp dot2 = bld.vop3p(aco_opcode::v_dot2_f32_f16, bld.def(v1), dpp, b, c, 0x0, 0x7); + writeout(0, dot2); + + //! v1: %dot4 = v_dot4c_i32_i8 %a, %b, %d row_mirror bound_ctrl:1 fi + //! p_unit_test 1, %dot4 + dpp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); + Temp dot4 = bld.vop3p(aco_opcode::v_dot4_i32_i8, bld.def(v1), dpp, b, d, 0x0, 0x7); + writeout(1, dot4); + + finish_opt_test(); +END_TEST