From 7108dac637249cb6bb7fa19eee84283bbe1fb252 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sat, 14 Dec 2024 19:40:51 +0100 Subject: [PATCH] aco/optimizer: use new helpers for s_lshl_add_u32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foz-DB Navi48: Totals from 7654 (9.29% of 82419) affected shaders: Instrs: 6170479 -> 6174536 (+0.07%); split: -0.07%, +0.13% CodeSize: 32489580 -> 32500100 (+0.03%); split: -0.07%, +0.10% SpillSGPRs: 4253 -> 4224 (-0.68%); split: -0.71%, +0.02% Latency: 60472662 -> 60489681 (+0.03%); split: -0.02%, +0.04% InvThroughput: 9218099 -> 9218149 (+0.00%); split: -0.01%, +0.01% VClause: 121094 -> 121089 (-0.00%); split: -0.01%, +0.00% SClause: 178092 -> 179830 (+0.98%); split: -0.55%, +1.53% Copies: 424495 -> 423756 (-0.17%); split: -0.57%, +0.40% Branches: 120352 -> 120353 (+0.00%); split: -0.01%, +0.01% PreSGPRs: 334391 -> 333381 (-0.30%); split: -0.33%, +0.02% VALU: 3349394 -> 3349323 (-0.00%); split: -0.00%, +0.00% SALU: 957913 -> 957149 (-0.08%); split: -0.25%, +0.17% VOPD: 9177 -> 9179 (+0.02%); split: +0.03%, -0.01% Foz-DB Navi21: Totals from 7649 (9.28% of 82387) affected shaders: Instrs: 6144605 -> 6143005 (-0.03%); split: -0.06%, +0.04% CodeSize: 32685976 -> 32672380 (-0.04%); split: -0.08%, +0.04% SpillSGPRs: 3079 -> 3067 (-0.39%); split: -0.42%, +0.03% Latency: 64979945 -> 65002741 (+0.04%); split: -0.02%, +0.05% InvThroughput: 14754398 -> 14754230 (-0.00%); split: -0.01%, +0.01% VClause: 132336 -> 132357 (+0.02%); split: -0.02%, +0.03% SClause: 190229 -> 191340 (+0.58%); split: -1.01%, +1.60% Copies: 511915 -> 511287 (-0.12%); split: -0.44%, +0.32% Branches: 157156 -> 157154 (-0.00%); split: -0.01%, +0.01% PreSGPRs: 345761 -> 344826 (-0.27%); split: -0.33%, +0.05% VALU: 3856887 -> 3856928 (+0.00%); split: -0.01%, +0.01% SALU: 1001190 -> 1000362 (-0.08%); split: -0.22%, +0.14% Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 55 
++++-------------------------- 1 file changed, 7 insertions(+), 48 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 947a39c1d12..2713cf5a611 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3038,14 +3038,6 @@ original_temp_id(opt_ctx& ctx, Temp tmp) return tmp.id(); } -Operand -copy_operand(opt_ctx& ctx, Operand op) -{ - if (op.isTemp()) - ctx.uses[op.tempId()]++; - return op; -} - Instruction* follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false) { @@ -3697,43 +3689,6 @@ combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr) return false; } -/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */ -bool -combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr) -{ - if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()]) - return false; - - for (unsigned i = 0; i < 2; i++) { - Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true); - if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 || - ctx.uses[op2_instr->definitions[1].tempId()]) - continue; - if (!op2_instr->operands[1].isConstant()) - continue; - - uint32_t shift = op2_instr->operands[1].constantValue(); - if (shift < 1 || shift > 4) - continue; - - if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() && - instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue()) - continue; - - instr->operands[1] = instr->operands[!i]; - instr->operands[0] = copy_operand(ctx, op2_instr->operands[0]); - decrease_and_dce(ctx, op2_instr->definitions[0].getTemp()); - ctx.info[instr->definitions[0].tempId()].label = 0; - - instr->opcode = std::array<aco_opcode, 4>{ - aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32, - aco_opcode::s_lshl4_add_u32}[shift - 1]; - - return true; - } - return false; -} - /* s_abs_i32(s_sub_[iu]32(a, b)) -> s_absdiff_i32(a, b) * s_abs_i32(s_add_[iu]32(a, #b)) -> 
s_absdiff_i32(a, -b) */ @@ -4642,9 +4597,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->gfx_level >= GFX9) { combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2); - } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && - ctx.program->gfx_level >= GFX9) { - combine_salu_lshl_add(ctx, instr); } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) { if (!combine_salu_not_bitwise(ctx, instr)) combine_inverse_comparison(ctx, instr); @@ -4859,6 +4811,13 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) add_opt(s_add_i32, v_add3_u32, 0x3, "012", nullptr, true); add_opt(v_lshlrev_b32, v_lshl_add_u32, 0x3, "210", nullptr, true); add_opt(s_lshl_b32, v_lshl_add_u32, 0x3, "120", nullptr, true); + } else if ((info.opcode == aco_opcode::s_add_u32 || + (info.opcode == aco_opcode::s_add_i32 && !ctx.uses[info.defs[1].tempId()])) && + ctx.program->gfx_level >= GFX9) { + add_opt(s_lshl_b32, s_lshl1_add_u32, 0x3, "102", remove_const_cb<1>); + add_opt(s_lshl_b32, s_lshl2_add_u32, 0x3, "102", remove_const_cb<2>); + add_opt(s_lshl_b32, s_lshl3_add_u32, 0x3, "102", remove_const_cb<3>); + add_opt(s_lshl_b32, s_lshl4_add_u32, 0x3, "102", remove_const_cb<4>); } if (match_and_apply_patterns(ctx, info, patterns)) {