mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 09:28:07 +02:00
aco/optimizer: apply dpp to v_dot before RA for gfx10.3
This is a bit unusual, as we otherwise only use the VOP2 code-size optimization opcodes in the register allocator. But unless we change the scheduler so that it does not split v_mov_b32_dpp and v_dot apart, we have no other choice.
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40510>
This commit is contained in:
parent
62f1268d78
commit
17a9ee7152
2 changed files with 59 additions and 5 deletions
|
|
@ -997,9 +997,32 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info)
|
|||
if (is_dpp_or_sdwa && !format_is(info.format, Format::VOPC) && info.defs[0].size() != 1)
|
||||
return false;
|
||||
|
||||
if (is_dpp && !opcode_supports_dpp(ctx.program->gfx_level, info.opcode,
|
||||
format_is(info.format, Format::VOP3P)))
|
||||
return false;
|
||||
if (is_dpp) {
|
||||
if ((info.opcode == aco_opcode::v_dot2_f32_f16 || info.opcode == aco_opcode::v_dot4_i32_i8) &&
|
||||
ctx.program->gfx_level >= GFX10 && ctx.program->gfx_level <= GFX10_3) {
|
||||
/* DPP only supports v_dotc for GFX10(.3), but it's really important it gets applied.
|
||||
* So already do the transformation before RA.
|
||||
*/
|
||||
if (neg || abs || vmask != 0x7 || opsel || !info.operands[0].extract[1].offset() ||
|
||||
!info.operands[1].extract[1].offset())
|
||||
return false;
|
||||
|
||||
if (info.opcode == aco_opcode::v_dot2_f32_f16)
|
||||
info.opcode = aco_opcode::v_dot2c_f32_f16;
|
||||
else
|
||||
info.opcode = aco_opcode::v_dot4c_i32_i8;
|
||||
|
||||
if (info.operands[0].dpp16)
|
||||
info.format = format_combine(Format::VOP2, Format::DPP16);
|
||||
else if (info.operands[0].dpp8)
|
||||
info.format = format_combine(Format::VOP2, Format::DPP8);
|
||||
|
||||
return true;
|
||||
} else if (!opcode_supports_dpp(ctx.program->gfx_level, info.opcode,
|
||||
format_is(info.format, Format::VOP3P))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (format_is(info.format, Format::VOP1) || format_is(info.format, Format::VOP2) ||
|
||||
format_is(info.format, Format::VOPC) || format_is(info.format, Format::VOP3)) {
|
||||
|
|
@ -1222,11 +1245,11 @@ alu_opt_gather_info(opt_ctx& ctx, Instruction* instr, alu_opt_info& info)
|
|||
return false;
|
||||
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_dot2c_f32_f16:
|
||||
case aco_opcode::v_dot4c_i32_i8: assert(instr->isDPP()); return false;
|
||||
case aco_opcode::s_addk_i32:
|
||||
case aco_opcode::s_cmovk_i32:
|
||||
case aco_opcode::s_mulk_i32:
|
||||
case aco_opcode::v_dot2c_f32_f16:
|
||||
case aco_opcode::v_dot4c_i32_i8:
|
||||
case aco_opcode::v_fmac_f32:
|
||||
case aco_opcode::v_fmac_f16:
|
||||
case aco_opcode::v_fmac_legacy_f32:
|
||||
|
|
@ -5076,6 +5099,12 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
if (!alu_opt_info_is_valid(ctx, candidate))
|
||||
continue;
|
||||
|
||||
/* Don't use dotc if it might need to mov the accumulator. */
|
||||
if ((candidate.opcode == aco_opcode::v_dot2c_f32_f16 ||
|
||||
candidate.opcode == aco_opcode::v_dot4c_i32_i8) &&
|
||||
ctx.uses[candidate.operands[2].op.tempId()] > 1)
|
||||
continue;
|
||||
|
||||
if (--ctx.uses[parent->definitions[0].tempId()])
|
||||
ctx.uses[parent->operands[0].tempId()]++;
|
||||
input_info.operands[i] = inner;
|
||||
|
|
|
|||
|
|
@ -2376,3 +2376,28 @@ BEGIN_TEST(optimizer.pk_mul_pk_cvt)
|
|||
finish_opt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/* Optimizer test for GFX10.3: a v_mov_b32 carrying a DPP modifier that feeds the
 * first source of v_dot2_f32_f16 / v_dot4_i32_i8 is expected to be combined into
 * the v_dot2c_f32_f16 / v_dot4c_i32_i8 opcode with the DPP modifier applied
 * directly, so the expected output reads %a instead of the v_mov result.
 */
BEGIN_TEST(optimizer.dotc_dpp)
|
||||
//>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3] = p_startpgm
|
||||
/* Compute-shader setup with four v1 inputs on GFX10_3; bail out if setup fails. */
if (!setup_cs("v1 v1 v1 v1", GFX10_3))
|
||||
return;
|
||||
|
||||
Temp a = inputs[0];
|
||||
Temp b = inputs[1];
|
||||
Temp c = inputs[2];
|
||||
Temp d = inputs[3];
|
||||
|
||||
/* DPP8 case: the v_mov_b32 uses a DPP8 selector of 0 and %c is the third
 * operand of the dot product; the expectation is a single v_dot2c_f32_f16
 * with the dpp8 modifier.
 */
//! v1: %dot2 = v_dot2c_f32_f16 %a, %b, %c dpp8:[0,0,0,0,0,0,0,0] fi
|
||||
//! p_unit_test 0, %dot2
|
||||
Temp dpp = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), a, 0);
|
||||
/* NOTE(review): the trailing 0x0, 0x7 are presumably the VOP3P opsel_lo/opsel_hi masks - confirm against Builder::vop3p. */
Temp dot2 = bld.vop3p(aco_opcode::v_dot2_f32_f16, bld.def(v1), dpp, b, c, 0x0, 0x7);
|
||||
writeout(0, dot2);
|
||||
|
||||
/* DPP16 case: the v_mov_b32 uses dpp_row_mirror and %d is the third operand;
 * the expectation is a single v_dot4c_i32_i8 with the row_mirror modifier.
 */
//! v1: %dot4 = v_dot4c_i32_i8 %a, %b, %d row_mirror bound_ctrl:1 fi
|
||||
//! p_unit_test 1, %dot4
|
||||
dpp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
|
||||
Temp dot4 = bld.vop3p(aco_opcode::v_dot4_i32_i8, bld.def(v1), dpp, b, d, 0x0, 0x7);
|
||||
writeout(1, dot4);
|
||||
|
||||
finish_opt_test();
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue