From 0359c8a90179b281d20d10aeebe220b199fb3ec5 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sat, 14 Dec 2024 18:56:57 +0100 Subject: [PATCH] aco/optimizer: use new helpers for v_add_u32 opts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foz-DB Navi48: Totals from 1554 (1.89% of 82419) affected shaders: Instrs: 5154325 -> 5151499 (-0.05%); split: -0.08%, +0.02% CodeSize: 27310012 -> 27318708 (+0.03%); split: -0.01%, +0.05% VGPRs: 97236 -> 97200 (-0.04%); split: -0.05%, +0.01% Latency: 34121873 -> 34120894 (-0.00%); split: -0.02%, +0.01% InvThroughput: 6735276 -> 6730418 (-0.07%); split: -0.08%, +0.01% VClause: 130106 -> 130090 (-0.01%); split: -0.05%, +0.04% SClause: 90439 -> 90449 (+0.01%); split: -0.00%, +0.01% Copies: 382920 -> 382401 (-0.14%); split: -0.18%, +0.05% Branches: 130089 -> 130091 (+0.00%) PreSGPRs: 67745 -> 67743 (-0.00%); split: -0.01%, +0.00% PreVGPRs: 72710 -> 72674 (-0.05%) VALU: 2941866 -> 2938129 (-0.13%); split: -0.13%, +0.00% SALU: 651032 -> 651779 (+0.11%); split: -0.02%, +0.14% VOPD: 2446 -> 2393 (-2.17%); split: +0.70%, -2.86% Foz-DB Navi21: Totals from 1534 (1.86% of 82387) affected shaders: MaxWaves: 32481 -> 32479 (-0.01%) Instrs: 4732755 -> 4730039 (-0.06%); split: -0.06%, +0.00% CodeSize: 25305728 -> 25313148 (+0.03%); split: -0.00%, +0.03% VGPRs: 84424 -> 84448 (+0.03%) SpillVGPRs: 2420 -> 2419 (-0.04%) Scratch: 180224 -> 179200 (-0.57%) Latency: 36843383 -> 36846269 (+0.01%); split: -0.01%, +0.02% InvThroughput: 9252495 -> 9238142 (-0.16%); split: -0.17%, +0.02% VClause: 146629 -> 146671 (+0.03%); split: -0.02%, +0.05% SClause: 94502 -> 94512 (+0.01%); split: -0.00%, +0.01% Copies: 403672 -> 403592 (-0.02%); split: -0.09%, +0.07% Branches: 141145 -> 141137 (-0.01%) PreSGPRs: 70003 -> 70001 (-0.00%); split: -0.01%, +0.00% PreVGPRs: 70835 -> 70800 (-0.05%) VALU: 3114513 -> 3111338 (-0.10%); split: -0.10%, +0.00% SALU: 651177 -> 651925 (+0.11%); split: -0.02%, +0.13% VMEM: 271263 -> 271261 (-0.00%) Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 102 +++------------------- src/amd/compiler/tests/test_optimizer.cpp | 4 +- 2 files changed, 15 insertions(+), 91 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index bc50e9e40bc..32e6b16058c 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3585,75 +3585,6 @@ combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, return false; } -/* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */ -bool -combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) -{ - bool is_or = instr->opcode == aco_opcode::v_or_b32; - aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32; - - if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32, - "120", 1 | 2)) - return true; - if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32, - "120", 1 | 2)) - return true; - if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2)) - return true; - if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2)) - return true; - - if (instr->isSDWA() || instr->isDPP()) - return false; - - /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b) - * v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b) - * v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b) - * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b) - */ - for (unsigned i = 0; i < 2; i++) { - Instruction* extins = follow_operand(ctx, instr->operands[i]); - if (!extins) - continue; - - aco_opcode op; - Operand operands[3]; - - if (extins->opcode == aco_opcode::p_insert && - (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) { - op = new_op_lshl; - operands[1] = - Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue()); - } else if (is_or && - (extins->opcode == aco_opcode::p_insert || - (extins->opcode == aco_opcode::p_extract && - extins->operands[3].constantEquals(0))) && - extins->operands[1].constantEquals(0)) { - op = aco_opcode::v_and_or_b32; - operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu); - } else { - continue; - } - - operands[0] = extins->operands[0]; - operands[2] = instr->operands[!i]; - - if (!check_vop3_operands(ctx, 3, operands)) - continue; - - uint8_t neg = 0, abs = 0, opsel = 0, omod = 0; - bool clamp = false; - if (instr->isVOP3()) - clamp = instr->valu().clamp; - - ctx.uses[instr->operands[i].tempId()]--; - create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod); - return true; - } - - return false; -} - /* v_not(v_xor(a, b)) -> v_xnor(a, b) */ bool combine_not_xor(opt_ctx& ctx, aco_ptr& instr) @@ -4684,26 +4615,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) } else if (instr->opcode == aco_opcode::v_not_b32 && ctx.program->gfx_level >= GFX10) { combine_not_xor(ctx, instr); } else if (instr->opcode == aco_opcode::v_add_u32 && !instr->usesModifiers()) { - if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { - } else if (combine_add_bcnt(ctx, instr)) { - } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, - aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { - } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_i32_i24, - aco_opcode::v_mad_i32_i24, "120", 1 | 2)) { - } else if (ctx.program->gfx_level >= GFX9) { - if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", - 1 | 2)) { - } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, - "120", 1 | 2)) { - } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32, - "012", 1 | 2)) { - } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, - "012", 1 | 2)) { - } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, - "012", 1 | 2)) { - } else if (combine_add_or_then_and_lshl(ctx, instr)) { - } - } + combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2); } else if ((instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_add_co_u32_e64) && !instr->usesModifiers()) { @@ -4934,6 +4846,18 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) add_opt(s_xor_b32, v_xor3_b32, 0x3, "012", nullptr, true); add_opt(v_not_b32, v_xnor_b32, 0x3, "01", nullptr, true); add_opt(s_not_b32, v_xnor_b32, 0x3, "01", nullptr, true); + } else if (info.opcode == aco_opcode::v_add_u32 && !info.clamp) { + assert(ctx.program->gfx_level >= GFX9); + add_opt(v_bcnt_u32_b32, v_bcnt_u32_b32, 0x3, "102", remove_const_cb<0>, true); + add_opt(v_mul_u32_u24, v_mad_u32_u24, 0x3, "120", nullptr, true); + add_opt(v_mul_i32_i24, v_mad_i32_i24, 0x3, "120", nullptr, true); + add_opt(v_xor_b32, v_xad_u32, 0x3, "120", nullptr, true); + add_opt(s_xor_b32, v_xad_u32, 0x3, "120", nullptr, true); + add_opt(v_add_u32, v_add3_u32, 0x3, "012", nullptr, true); + add_opt(s_add_u32, v_add3_u32, 0x3, "012", nullptr, true); + add_opt(s_add_i32, v_add3_u32, 0x3, "012", nullptr, true); + add_opt(v_lshlrev_b32, v_lshl_add_u32, 0x3, "210", nullptr, true); + add_opt(s_lshl_b32, v_lshl_add_u32, 0x3, "120", nullptr, true); } if (match_and_apply_patterns(ctx, info, patterns)) { diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 6e41b067ab2..861bf6ce9fc 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -363,9 +363,9 @@ BEGIN_TEST(optimize.bcnt) bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero()); writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u))); - //! v1: %bnct3 = v_bcnt_u32_b32 %b, 0 + //~gfx8! v1: %bnct3 = v_bcnt_u32_b32 %b, 0 //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a - //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a + //~gfx(9|10)! v1: %res3 = v_bcnt_u32_b32 %b, %a //! p_unit_test 3, %res3 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero()); writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));