From 631e18d4275dc46cf47c969e85d8ec2d3d0262be Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 5 Jun 2020 17:36:29 +0100 Subject: [PATCH] aco: create v_mad_u32_u24 fossil-db (Navi): Totals from 849 (0.61% of 138791) affected shaders: SGPRs: 38528 -> 38544 (+0.04%) VGPRs: 39860 -> 39856 (-0.01%) CodeSize: 2701880 -> 2702016 (+0.01%) MaxWaves: 9148 -> 9150 (+0.02%) Instrs: 509864 -> 509821 (-0.01%); split: -0.01%, +0.00% Cycles: 3400124 -> 3399628 (-0.01%); split: -0.02%, +0.00% VMEM: 262757 -> 262672 (-0.03%) SMEM: 59710 -> 59704 (-0.01%) Copies: 44461 -> 44466 (+0.01%) fossil-db (Polaris): Totals from 1487 (1.06% of 140385) affected shaders: SGPRs: 54688 -> 55840 (+2.11%) CodeSize: 2725608 -> 2725720 (+0.00%); split: -0.01%, +0.01% Instrs: 521394 -> 517710 (-0.71%) Cycles: 18474108 -> 18410964 (-0.34%) VMEM: 436992 -> 431028 (-1.36%); split: +0.06%, -1.43% SMEM: 124503 -> 122564 (-1.56%); split: +0.45%, -2.00% VClause: 21972 -> 22015 (+0.20%); split: -0.12%, +0.31% SClause: 14274 -> 14287 (+0.09%) Copies: 44407 -> 44411 (+0.01%); split: -0.02%, +0.03% PreSGPRs: 34318 -> 34321 (+0.01%); split: -0.00%, +0.01% Signed-off-by: Rhys Perry Reviewed-by: Samuel Pitoiset Part-of: --- src/amd/compiler/aco_optimizer.cpp | 14 +++++++------- src/amd/compiler/tests/test_optimizer.cpp | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 5d810b306bb..47066d4080f 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1316,6 +1316,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); } break; + case aco_opcode::v_mul_u32_u24: + ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); + break; case aco_opcode::v_and_b32: { /* abs */ if (!instr->usesModifiers() && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr && @@ -2324,12 +2327,6 @@ bool combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) if (instr->usesModifiers()) return false; - /* Do not combine if the carry-out is used. */ - if ((instr->opcode == aco_opcode::v_add_co_u32 || - instr->opcode == aco_opcode::v_add_co_u32_e64) && - ctx.uses[instr->definitions[1].tempId()]) - return false; - for (unsigned i = 0; i < 2; i++) { Instruction *op_instr = follow_operand(ctx, instr->operands[i]); if (op_instr && @@ -2912,6 +2909,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr } else if (instr->opcode == aco_opcode::v_add_u32) { if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; else if (combine_add_bcnt(ctx, instr)) ; + else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) ; else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) { if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ; @@ -2924,8 +2922,10 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr } } else if (instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_add_co_u32_e64) { + bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0; if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ; - else combine_add_bcnt(ctx, instr); + else if (!carry_out && combine_add_bcnt(ctx, instr)) ; + else if (!carry_out) combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2); } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || instr->opcode == aco_opcode::v_sub_co_u32_e64) { diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 8d6805febb8..bf7b51bee69 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -723,3 +723,24 @@ BEGIN_TEST(optimize.minmax) finish_opt_test(); } END_TEST + +BEGIN_TEST(optimize.mad_32_24) + for (unsigned i = GFX8; i <= GFX9; i++) { + //>> v1: %a, v1: %b, v1: %c, s2: %_:exec = p_startpgm + if (!setup_cs("v1 v1 v1", (chip_class)i)) + continue; + + //! v1: %res0 = v_mad_u32_u24 %b, %c, %a + //! p_unit_test 0, %res0 + Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]); + writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul)); + + //! v1: %res1_tmp = v_mul_u32_u24 %b, %c + //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp + //! p_unit_test 1, %res1 + mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]); + writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp()); + + finish_opt_test(); + } +END_TEST