aco: create v_mad_u32_u24

fossil-db (Navi):
Totals from 849 (0.61% of 138791) affected shaders:
SGPRs: 38528 -> 38544 (+0.04%)
VGPRs: 39860 -> 39856 (-0.01%)
CodeSize: 2701880 -> 2702016 (+0.01%)
MaxWaves: 9148 -> 9150 (+0.02%)
Instrs: 509864 -> 509821 (-0.01%); split: -0.01%, +0.00%
Cycles: 3400124 -> 3399628 (-0.01%); split: -0.02%, +0.00%
VMEM: 262757 -> 262672 (-0.03%)
SMEM: 59710 -> 59704 (-0.01%)
Copies: 44461 -> 44466 (+0.01%)

fossil-db (Polaris):
Totals from 1487 (1.06% of 140385) affected shaders:
SGPRs: 54688 -> 55840 (+2.11%)
CodeSize: 2725608 -> 2725720 (+0.00%); split: -0.01%, +0.01%
Instrs: 521394 -> 517710 (-0.71%)
Cycles: 18474108 -> 18410964 (-0.34%)
VMEM: 436992 -> 431028 (-1.36%); split: +0.06%, -1.43%
SMEM: 124503 -> 122564 (-1.56%); split: +0.45%, -2.00%
VClause: 21972 -> 22015 (+0.20%); split: -0.12%, +0.31%
SClause: 14274 -> 14287 (+0.09%)
Copies: 44407 -> 44411 (+0.01%); split: -0.02%, +0.03%
PreSGPRs: 34318 -> 34321 (+0.01%); split: -0.00%, +0.01%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7639>
This commit is contained in:
Rhys Perry 2020-06-05 17:36:29 +01:00 committed by Marge Bot
parent 1200f6da0b
commit 631e18d427
2 changed files with 28 additions and 7 deletions

View file

@@ -1316,6 +1316,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
}
break;
case aco_opcode::v_mul_u32_u24:
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
break;
case aco_opcode::v_and_b32: { /* abs */
if (!instr->usesModifiers() && instr->operands[1].isTemp() &&
instr->operands[1].getTemp().type() == RegType::vgpr &&
@@ -2324,12 +2327,6 @@ bool combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (instr->usesModifiers())
return false;
/* Do not combine if the carry-out is used. */
if ((instr->opcode == aco_opcode::v_add_co_u32 ||
instr->opcode == aco_opcode::v_add_co_u32_e64) &&
ctx.uses[instr->definitions[1].tempId()])
return false;
for (unsigned i = 0; i < 2; i++) {
Instruction *op_instr = follow_operand(ctx, instr->operands[i]);
if (op_instr &&
@@ -2912,6 +2909,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
} else if (instr->opcode == aco_opcode::v_add_u32) {
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
else if (combine_add_bcnt(ctx, instr)) ;
else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ;
else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) {
if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
@@ -2924,8 +2922,10 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
}
} else if (instr->opcode == aco_opcode::v_add_co_u32 ||
instr->opcode == aco_opcode::v_add_co_u32_e64) {
bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
else combine_add_bcnt(ctx, instr);
else if (!carry_out && combine_add_bcnt(ctx, instr)) ;
else if (!carry_out) combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2);
} else if (instr->opcode == aco_opcode::v_sub_u32 ||
instr->opcode == aco_opcode::v_sub_co_u32 ||
instr->opcode == aco_opcode::v_sub_co_u32_e64) {

View file

@@ -723,3 +723,24 @@ BEGIN_TEST(optimize.minmax)
finish_opt_test();
}
END_TEST
/*
 * Optimizer test: a v_mul_u32_u24 whose result feeds a 32-bit VALU add is
 * expected to become a single v_mad_u32_u24, but only when the add's second
 * definition is not consumed.
 *
 * The "//>>" and "//!" lines below are expected-output patterns for the test
 * harness, not ordinary comments; they must stay exactly as written.
 */
BEGIN_TEST(optimize.mad_32_24)
/* Run the same checks for every chip class from GFX8 up to and including GFX9. */
for (unsigned i = GFX8; i <= GFX9; i++) {
//>> v1: %a, v1: %b, v1: %c, s2: %_:exec = p_startpgm
/* Three v1 inputs (%a, %b, %c above); skip this chip class if setup fails. */
if (!setup_cs("v1 v1 v1", (chip_class)i))
continue;
/* Case 0: a + (b * c) with a plain add; the mul and add should be combined. */
//! v1: %res0 = v_mad_u32_u24 %b, %c, %a
//! p_unit_test 0, %res0
Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));
/*
 * Case 1: the same expression, but the add's second definition (the s2 value
 * %res1 in the pattern below) is what gets written out. The mul and the
 * v_add_co_u32 are expected to stay separate.
 * NOTE(review): the `true` argument to vadd32 presumably requests a
 * carry-out definition -- confirm against the builder.
 */
//! v1: %res1_tmp = v_mul_u32_u24 %b, %c
//! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
//! p_unit_test 1, %res1
mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());
finish_opt_test();
}
END_TEST