From 4442064449e20ef72cb8192c12e795dadb3985eb Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Thu, 9 Jan 2025 19:48:47 +0100 Subject: [PATCH] aco/optimizer: use new helpers to apply neg/abs to output of instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foz-DB Navi21: Totals from 6765 (6.93% of 97591) affected shaders: MaxWaves: 134398 -> 134408 (+0.01%) Instrs: 9775725 -> 9768079 (-0.08%); split: -0.08%, +0.01% CodeSize: 50785228 -> 50777880 (-0.01%); split: -0.02%, +0.01% VGPRs: 445840 -> 445784 (-0.01%) SpillSGPRs: 14483 -> 14476 (-0.05%) Latency: 40232431 -> 40230284 (-0.01%); split: -0.04%, +0.03% InvThroughput: 10339051 -> 10329846 (-0.09%); split: -0.09%, +0.00% VClause: 186785 -> 186788 (+0.00%); split: -0.01%, +0.01% SClause: 157106 -> 157116 (+0.01%); split: -0.00%, +0.01% Copies: 746817 -> 745378 (-0.19%); split: -0.26%, +0.07% Branches: 189298 -> 189211 (-0.05%); split: -0.06%, +0.01% PreSGPRs: 346169 -> 346158 (-0.00%) PreVGPRs: 370712 -> 370660 (-0.01%); split: -0.02%, +0.00% VALU: 6847295 -> 6839753 (-0.11%); split: -0.11%, +0.00% SALU: 1139960 -> 1139942 (-0.00%); split: -0.00%, +0.00% Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 135 +++++++++++++++-------------- 1 file changed, 68 insertions(+), 67 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index aa41eade1e6..4a5ce4947a3 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3921,6 +3921,66 @@ op_info_get_constant(opt_ctx& ctx, alu_opt_op op_info, aco_type type, uint64_t* return true; } +/* neg(mul(a, b)) -> mul(neg(a), b), abs(mul(a, b)) -> mul(abs(a), abs(b)) */ +Instruction* +apply_output_mul(opt_ctx& ctx, aco_ptr& instr, Instruction* parent) +{ + alu_opt_info info; + if (!alu_opt_gather_info(ctx, instr.get(), info)) + return nullptr; + aco_type type = instr_info.alu_opcode_infos[(int)instr->opcode].def_types[0]; + + unsigned denorm_mode = type.bit_size == 32 ? ctx.fp_mode.denorm32 : ctx.fp_mode.denorm16_64; + if (!ctx.info[parent->definitions[0].tempId()].is_canonicalized(type.bit_size) && + denorm_mode != fp_denorm_keep) + return nullptr; + + aco_type parent_type = instr_info.alu_opcode_infos[(int)parent->opcode].def_types[0]; + + if (type.num_components != parent_type.num_components || type.bit_size != parent_type.bit_size || + instr->definitions[0].regClass().type() != parent->definitions[0].regClass().type()) + return nullptr; + + unsigned cidx = !info.operands[0].op.isConstant(); + + uint64_t constant = 0; + if (!op_info_get_constant(ctx, info.operands[cidx], type, &constant)) + return nullptr; + + for (unsigned i = 0; i < type.num_components; i++) { + double val = extract_float(constant, type.bit_size, i); + if (val < 0.0) { + val = fabs(val); + info.operands[!cidx].neg[i] ^= true; + } + + if (val != 1.0) + return nullptr; + } + + if ((info.omod || info.clamp) && + !instr_info.alu_opcode_infos[(int)parent->opcode].output_modifiers) + return nullptr; + + alu_opt_info parent_info; + if (!alu_opt_gather_info(ctx, parent, parent_info)) + return nullptr; + + if (parent_info.uses_insert() || (info.omod && (parent_info.omod || parent_info.clamp))) + return nullptr; + + if (!backpropagate_input_modifiers(ctx, parent_info, info.operands[!cidx], type)) + return nullptr; + + parent_info.clamp |= info.clamp; + parent_info.omod |= info.omod; + parent_info.insert = info.insert; + parent_info.defs[0].setTemp(info.defs[0].getTemp()); + if (!alu_opt_info_is_valid(ctx, parent_info)) + return nullptr; + return alu_opt_info_to_instr(ctx, parent_info, parent); +} + Instruction* apply_output_impl(opt_ctx& ctx, aco_ptr& instr, Instruction* parent) { @@ -3935,6 +3995,9 @@ apply_output_impl(opt_ctx& ctx, aco_ptr& instr, Instruction* parent return apply_s_not(ctx, instr, parent); else if (instr->opcode == aco_opcode::s_abs_i32) return apply_s_abs(ctx, instr, parent); + else if (instr->opcode == aco_opcode::v_mul_f64 || instr->opcode == aco_opcode::v_mul_f64_e64 || + instr->opcode == aco_opcode::v_mul_f32 || instr->opcode == aco_opcode::v_mul_f16) + return apply_output_mul(ctx, instr, parent); else UNREACHABLE("unhandled opcode"); @@ -3949,7 +4012,11 @@ apply_output(opt_ctx& ctx, aco_ptr& instr) case aco_opcode::v_not_b32: case aco_opcode::s_not_b32: case aco_opcode::s_not_b64: - case aco_opcode::s_abs_i32: break; + case aco_opcode::s_abs_i32: + case aco_opcode::v_mul_f64: + case aco_opcode::v_mul_f64_e64: + case aco_opcode::v_mul_f32: + case aco_opcode::v_mul_f16: break; default: return false; } @@ -4195,21 +4262,6 @@ and_cb(opt_ctx& ctx, alu_opt_info& info) return func1(ctx, info) && func2(ctx, info); } -bool -is_mul(Instruction* instr) -{ - switch (instr->opcode) { - case aco_opcode::v_mul_f64_e64: - case aco_opcode::v_mul_f64: - case aco_opcode::v_mul_f32: - case aco_opcode::v_mul_legacy_f32: - case aco_opcode::v_mul_f16: return true; - case aco_opcode::v_fma_mix_f32: - return instr->operands[2].constantEquals(0) && instr->valu().neg[2]; - default: return false; - } -} - void combine_instruction(opt_ctx& ctx, aco_ptr& instr) { @@ -4258,57 +4310,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) * The various comparison optimizations also currently only work with 32-bit * floats. */ - /* neg(mul(a, b)) -> mul(neg(a), b), abs(mul(a, b)) -> mul(abs(a), abs(b)) */ - if ((ctx.info[instr->definitions[0].tempId()].label & input_mod_labels) && - ctx.uses[ctx.info[instr->definitions[0].tempId()].temp.id()] == 1) { - Temp val = ctx.info[instr->definitions[0].tempId()].temp; - Instruction* mul_instr = ctx.info[val.id()].parent_instr; - - if (!is_mul(mul_instr)) - return; - - if (mul_instr->operands[0].isLiteral()) - return; - if (mul_instr->valu().clamp) - return; - if (mul_instr->isSDWA() || mul_instr->isDPP()) - return; - if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32 && - mul_instr->definitions[0].isSZPreserve()) - return; - if (mul_instr->definitions[0].bytes() != instr->definitions[0].bytes()) - return; - - /* convert to mul(neg(a), b), mul(abs(a), abs(b)) or mul(neg(abs(a)), abs(b)) */ - ctx.uses[mul_instr->definitions[0].tempId()]--; - Definition def = instr->definitions[0]; - bool is_neg = ctx.info[instr->definitions[0].tempId()].is_neg(def.bytes() * 8); - bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs(def.bytes() * 8); - uint32_t pass_flags = instr->pass_flags; - Format format = mul_instr->format == Format::VOP2 ? asVOP3(Format::VOP2) : mul_instr->format; - instr.reset(create_instruction(mul_instr->opcode, format, mul_instr->operands.size(), 1)); - std::copy(mul_instr->operands.cbegin(), mul_instr->operands.cend(), instr->operands.begin()); - instr->pass_flags = pass_flags; - instr->definitions[0] = def; - VALU_instruction& new_mul = instr->valu(); - VALU_instruction& mul = mul_instr->valu(); - new_mul.neg = mul.neg; - new_mul.abs = mul.abs; - new_mul.omod = mul.omod; - new_mul.opsel = mul.opsel; - new_mul.opsel_lo = mul.opsel_lo; - new_mul.opsel_hi = mul.opsel_hi; - if (is_abs) { - new_mul.neg[0] = new_mul.neg[1] = false; - new_mul.abs[0] = new_mul.abs[1] = true; - } - new_mul.neg[0] ^= is_neg; - new_mul.clamp = false; - - ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get(); - return; - } - alu_opt_info info; if (!alu_opt_gather_info(ctx, instr.get(), info)) return;