aco/optimizer: apply dpp to v_dot before RA for gfx10.3

This is a bit unusual, as we otherwise only use the VOP2 codesize
optimization opcodes in the register allocator.

But unless we change the scheduler to not split v_mov_b32_dpp and
v_dot, we have no other choice.

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40510>
This commit is contained in:
Georg Lehmann 2026-03-19 13:59:21 +01:00 committed by Marge Bot
parent 62f1268d78
commit 17a9ee7152
2 changed files with 59 additions and 5 deletions

View file

@ -997,9 +997,32 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info)
if (is_dpp_or_sdwa && !format_is(info.format, Format::VOPC) && info.defs[0].size() != 1)
return false;
if (is_dpp && !opcode_supports_dpp(ctx.program->gfx_level, info.opcode,
format_is(info.format, Format::VOP3P)))
return false;
if (is_dpp) {
if ((info.opcode == aco_opcode::v_dot2_f32_f16 || info.opcode == aco_opcode::v_dot4_i32_i8) &&
ctx.program->gfx_level >= GFX10 && ctx.program->gfx_level <= GFX10_3) {
/* DPP only supports v_dotc for GFX10(.3), but it's really important it gets applied.
* So already do the transformation before RA.
*/
if (neg || abs || vmask != 0x7 || opsel || !info.operands[0].extract[1].offset() ||
!info.operands[1].extract[1].offset())
return false;
if (info.opcode == aco_opcode::v_dot2_f32_f16)
info.opcode = aco_opcode::v_dot2c_f32_f16;
else
info.opcode = aco_opcode::v_dot4c_i32_i8;
if (info.operands[0].dpp16)
info.format = format_combine(Format::VOP2, Format::DPP16);
else if (info.operands[0].dpp8)
info.format = format_combine(Format::VOP2, Format::DPP8);
return true;
} else if (!opcode_supports_dpp(ctx.program->gfx_level, info.opcode,
format_is(info.format, Format::VOP3P))) {
return false;
}
}
if (format_is(info.format, Format::VOP1) || format_is(info.format, Format::VOP2) ||
format_is(info.format, Format::VOPC) || format_is(info.format, Format::VOP3)) {
@ -1222,11 +1245,11 @@ alu_opt_gather_info(opt_ctx& ctx, Instruction* instr, alu_opt_info& info)
return false;
switch (instr->opcode) {
case aco_opcode::v_dot2c_f32_f16:
case aco_opcode::v_dot4c_i32_i8: assert(instr->isDPP()); return false;
case aco_opcode::s_addk_i32:
case aco_opcode::s_cmovk_i32:
case aco_opcode::s_mulk_i32:
case aco_opcode::v_dot2c_f32_f16:
case aco_opcode::v_dot4c_i32_i8:
case aco_opcode::v_fmac_f32:
case aco_opcode::v_fmac_f16:
case aco_opcode::v_fmac_legacy_f32:
@ -5076,6 +5099,12 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (!alu_opt_info_is_valid(ctx, candidate))
continue;
/* Don't use dotc if it might need to mov the accumulator. */
if ((candidate.opcode == aco_opcode::v_dot2c_f32_f16 ||
candidate.opcode == aco_opcode::v_dot4c_i32_i8) &&
ctx.uses[candidate.operands[2].op.tempId()] > 1)
continue;
if (--ctx.uses[parent->definitions[0].tempId()])
ctx.uses[parent->operands[0].tempId()]++;
input_info.operands[i] = inner;

View file

@ -2376,3 +2376,28 @@ BEGIN_TEST(optimizer.pk_mul_pk_cvt)
finish_opt_test();
}
END_TEST
/* Optimizer test for GFX10.3: a v_mov_b32 carrying a DPP modifier that feeds the
 * first source of a VOP3P dot product. The `//!` patterns below expect the pair
 * to come out as a single v_dot2c_f32_f16 / v_dot4c_i32_i8 carrying the DPP
 * modifier itself.
 *
 * NOTE: the `//>>` and `//!` lines are expected-output patterns, not ordinary
 * comments; keep them byte-for-byte in sync with the code that follows them.
 */
BEGIN_TEST(optimizer.dotc_dpp)
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3] = p_startpgm
/* Four v1 inputs; bail out if the test setup for GFX10_3 is not available. */
if (!setup_cs("v1 v1 v1 v1", GFX10_3))
return;
Temp a = inputs[0];
Temp b = inputs[1];
Temp c = inputs[2];
Temp d = inputs[3];
/* Case 0: DPP8 mov of %a into v_dot2_f32_f16 with accumulator %c.
 * The trailing 0x0, 0x7 arguments are presumably opsel_lo / opsel_hi
 * (TODO confirm against the builder's vop3p signature).
 */
//! v1: %dot2 = v_dot2c_f32_f16 %a, %b, %c dpp8:[0,0,0,0,0,0,0,0] fi
//! p_unit_test 0, %dot2
Temp dpp = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), a, 0);
Temp dot2 = bld.vop3p(aco_opcode::v_dot2_f32_f16, bld.def(v1), dpp, b, c, 0x0, 0x7);
writeout(0, dot2);
/* Case 1: DPP16 (row_mirror) mov of %a into v_dot4_i32_i8 with accumulator %d.
 * NOTE(review): a different accumulator than case 0 is used, presumably so each
 * accumulator temp has exactly one use -- confirm.
 */
//! v1: %dot4 = v_dot4c_i32_i8 %a, %b, %d row_mirror bound_ctrl:1 fi
//! p_unit_test 1, %dot4
dpp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
Temp dot4 = bld.vop3p(aco_opcode::v_dot4_i32_i8, bld.def(v1), dpp, b, d, 0x0, 0x7);
writeout(1, dot4);
/* Run the optimizer and compare the result against the patterns above. */
finish_opt_test();
END_TEST