From ee28801eae5cfda1fe7c4e5be37d0925105e548c Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Thu, 9 Jan 2025 22:04:42 +0100 Subject: [PATCH] aco/optimizer: use new helpers to apply insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foz-DB Navi21: Totals from 505 (0.52% of 97591) affected shaders: Instrs: 1438254 -> 1436780 (-0.10%); split: -0.11%, +0.01% CodeSize: 8063364 -> 8054192 (-0.11%); split: -0.13%, +0.01% Latency: 18596788 -> 18597262 (+0.00%); split: -0.01%, +0.01% InvThroughput: 5213861 -> 5213061 (-0.02%); split: -0.02%, +0.01% VClause: 37121 -> 37130 (+0.02%) Copies: 174744 -> 175222 (+0.27%); split: -0.07%, +0.34% Branches: 65722 -> 65718 (-0.01%) VALU: 912967 -> 911074 (-0.21%); split: -0.21%, +0.00% SALU: 251045 -> 251560 (+0.21%); split: -0.01%, +0.21% Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 86 ++++++++++-------------------- 1 file changed, 29 insertions(+), 57 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index e8f378ea925..b1c9e071f01 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -70,12 +70,11 @@ enum Label { label_omod4 = 1ull << 34, label_omod5 = 1ull << 35, label_clamp = 1ull << 36, - label_insert = 1ull << 38, label_f2f16 = 1ull << 39, }; static constexpr uint64_t instr_mod_labels = - label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16; + label_omod2 | label_omod4 | label_omod5 | label_clamp | label_f2f16; static constexpr uint64_t input_mod_labels = label_abs_fp16 | label_abs_fp32_64 | label_neg_fp16 | label_neg_fp32_64; @@ -292,16 +291,6 @@ struct ssa_info { void set_extract() { add_label(label_extract); } bool is_extract() { return label & label_extract; } - - void set_insert(Instruction* insert) - { - if (label & temp_labels) - return; - add_label(label_insert); - mod_instr = insert; - } - - bool is_insert() { return label & label_insert; } }; struct opt_ctx { @@ -986,7 +975,8 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info) if (!info.uses_insert()) { info.insert = SubdwordSel::dword; } else if (info.defs[0].bytes() != 4 || - (!format_is(info.format, Format::VOP1) && !format_is(info.format, Format::VOP2))) { + (!format_is(info.format, Format::VOP1) && !format_is(info.format, Format::VOP2)) || + ctx.program->gfx_level < GFX8 || ctx.program->gfx_level >= GFX11) { return false; } else { info.format = format_combine(info.format, Format::SDWA); @@ -3018,19 +3008,12 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) case aco_opcode::p_extract: { if (instr->operands[0].isTemp()) { ctx.info[instr->definitions[0].tempId()].set_extract(); - if (instr->definitions[0].bytes() == 4 && instr->operands[0].regClass() == v1 && - parse_insert(instr.get())) - ctx.info[instr->operands[0].tempId()].set_insert(instr.get()); } break; } case aco_opcode::p_insert: { - if (instr->operands[0].isTemp()) { - if (instr->operands[0].regClass() == v1) - ctx.info[instr->operands[0].tempId()].set_insert(instr.get()); - if (parse_extract(instr.get())) - ctx.info[instr->definitions[0].tempId()].set_extract(); - } + if (instr->operands[0].isTemp() && parse_extract(instr.get())) + ctx.info[instr->definitions[0].tempId()].set_extract(); break; } case aco_opcode::v_cvt_f16_f32: { @@ -3600,7 +3583,7 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr& instr) instr->valu().clamp = true; instr->definitions[0].swapTemp(def_info.mod_instr->definitions[0]); - ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16; + ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_f2f16; ctx.uses[def_info.mod_instr->definitions[0].tempId()]--; ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get(); ctx.info[def_info.mod_instr->definitions[0].tempId()].parent_instr = def_info.mod_instr; @@ -3609,45 +3592,34 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr& instr) } /* Combine an p_insert (or p_extract, in some cases) instruction with instr. - * p_insert(instr(...)) -> instr_insert(). + * p_insert(parent(...)) -> instr_insert(). */ -bool -apply_insert(opt_ctx& ctx, aco_ptr& instr) +Instruction* +apply_insert(opt_ctx& ctx, aco_ptr& instr, Instruction* parent) { - if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1) - return false; + if (instr->definitions[0].regClass() != v1) + return nullptr; - ssa_info& def_info = ctx.info[instr->definitions[0].tempId()]; - if (!def_info.is_insert()) - return false; - /* if the insert instruction is dead, then the single user of this - * instruction is a different instruction */ - if (!ctx.uses[def_info.mod_instr->definitions[0].tempId()]) - return false; + SubdwordSel sel = parse_insert(instr.get()); + if (!sel) + return nullptr; - /* MADs/FMAs are created later, so we don't have to update the original add */ - assert(!ctx.info[instr->definitions[0].tempId()].is_combined()); + if (ctx.info[instr->operands[0].tempId()].label & temp_labels) + return nullptr; - SubdwordSel sel = parse_insert(def_info.mod_instr); - assert(sel); + alu_opt_info parent_info; + if (!alu_opt_gather_info(ctx, parent, parent_info)) + return nullptr; - if (!can_use_SDWA(ctx.program->gfx_level, instr, true)) - return false; + if (parent_info.uses_insert()) + return nullptr; - convert_to_SDWA(ctx.program->gfx_level, instr); - if (instr->sdwa().dst_sel.size() != 4) - return false; - instr->sdwa().dst_sel = sel; + parent_info.insert = sel; - instr->definitions[0].swapTemp(def_info.mod_instr->definitions[0]); - ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.uses[def_info.mod_instr->definitions[0].tempId()]--; - ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[def_info.mod_instr->definitions[0].tempId()].parent_instr = def_info.mod_instr; - for (const Definition& def : instr->definitions) - ctx.info[def.tempId()].parent_instr = instr.get(); - - return true; + parent_info.defs[0].setTemp(instr->definitions[0].getTemp()); + if (!alu_opt_info_is_valid(ctx, parent_info)) + return nullptr; + return alu_opt_info_to_instr(ctx, parent_info, parent); } /* Remove superfluous extract after ds_read like so: @@ -3946,8 +3918,8 @@ apply_output_impl(opt_ctx& ctx, aco_ptr& instr, Instruction* parent if (instr->opcode == aco_opcode::p_extract && (parent->isDS() || parent->isSMEM() || parent->isMUBUF() || parent->isFlatLike())) return apply_load_extract(ctx, instr, parent); - else if (instr->opcode == aco_opcode::p_extract) - return nullptr; + else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert) + return apply_insert(ctx, instr, parent); else if (instr->opcode == aco_opcode::v_not_b32) return apply_v_not(ctx, instr, parent); else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) @@ -3969,6 +3941,7 @@ apply_output(opt_ctx& ctx, aco_ptr& instr) { switch (instr->opcode) { case aco_opcode::p_extract: + case aco_opcode::p_insert: case aco_opcode::v_not_b32: case aco_opcode::s_not_b32: case aco_opcode::s_not_b64: @@ -4244,7 +4217,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) if (instr->isVALU()) { while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr)) ; - apply_insert(ctx, instr); } if (instr->isDPP())