diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index bc842a58906..1fd3cf61941 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -527,6 +527,7 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, Temp temp, unsi return false; break; case aco_opcode::p_extract_vector: + case aco_opcode::p_extract: if (temp.type() == RegType::sgpr && !can_accept_sgpr) return false; break; @@ -991,9 +992,21 @@ can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_i can_use_opsel(ctx.program->chip_class, instr->opcode, idx, sel.offset()) && !(instr->vop3().opsel & (1 << idx))) { return true; - } else { - return false; + } else if (instr->opcode == aco_opcode::p_extract) { + SubdwordSel instrSel = parse_extract(instr.get()); + + /* the outer offset must be within extracted range */ + if (instrSel.offset() >= sel.size()) + return false; + + /* don't remove the sign-extension when increasing the size further */ + if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend()) + return false; + + return true; } + + return false; } /* Combine an p_extract (or p_insert, in some cases) instruction with instr. @@ -1033,6 +1046,18 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& } else if (instr->isVOP3()) { if (sel.offset()) instr->vop3().opsel |= 1 << idx; + } else if (instr->opcode == aco_opcode::p_extract) { + SubdwordSel instrSel = parse_extract(instr.get()); + + unsigned size = std::min(sel.size(), instrSel.size()); + unsigned offset = sel.offset() + instrSel.offset(); + unsigned sign_extend = + instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size()); + + instr->operands[1] = Operand::c32(offset / size); + instr->operands[2] = Operand::c32(size * 8u); + instr->operands[3] = Operand::c32(sign_extend); + return; } /* output modifier and label_vopc seem to be the only one worth keeping at the moment */ @@ -3406,8 +3431,17 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) if (instr->isSDWA() || instr->isDPP()) return; - if (instr->opcode == aco_opcode::p_extract) + if (instr->opcode == aco_opcode::p_extract) { + ssa_info& info = ctx.info[instr->operands[0].tempId()]; + if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) { + apply_extract(ctx, instr, 0, info); + if (--ctx.uses[instr->operands[0].tempId()]) + ctx.uses[info.instr->operands[0].tempId()]++; + instr->operands[0].setTemp(info.instr->operands[0].getTemp()); + } + apply_ds_extract(ctx, instr); + } /* TODO: There are still some peephole optimizations that could be done: * - abs(a - b) -> s_absdiff_i32