aco/optimizer: optimize extract(extract())

Totals from 53 (0.04% of 134572) affected shaders: (GFX10.3)
SpillVGPRs: 1780 -> 1776 (-0.22%); split: -0.34%, +0.11%
CodeSize: 968352 -> 963196 (-0.53%); split: -0.55%, +0.02%
Scratch: 180224 -> 178176 (-1.14%)
Instrs: 169800 -> 169158 (-0.38%); split: -0.39%, +0.01%
Latency: 6186064 -> 6141408 (-0.72%); split: -1.16%, +0.44%
InvThroughput: 2605044 -> 2582967 (-0.85%); split: -1.37%, +0.52%
VClause: 4851 -> 4866 (+0.31%); split: -0.16%, +0.47%
SClause: 1744 -> 1746 (+0.11%)
Copies: 42874 -> 42325 (-1.28%); split: -1.40%, +0.12%
Branches: 5762 -> 5765 (+0.05%); split: -0.02%, +0.07%

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13576>
This commit is contained in:
Daniel Schürmann 2021-10-05 13:09:02 +01:00 committed by Marge Bot
parent 5ad9c20d4a
commit fb622775b5

View file

@ -527,6 +527,7 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsi
return false;
break;
case aco_opcode::p_extract_vector:
case aco_opcode::p_extract:
if (temp.type() == RegType::sgpr && !can_accept_sgpr)
return false;
break;
@ -991,9 +992,21 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i
can_use_opsel(ctx.program->chip_class, instr->opcode, idx, sel.offset()) &&
!(instr->vop3().opsel & (1 << idx))) {
return true;
} else {
return false;
} else if (instr->opcode == aco_opcode::p_extract) {
SubdwordSel instrSel = parse_extract(instr.get());
/* the outer offset must be within extracted range */
if (instrSel.offset() >= sel.size())
return false;
/* don't remove the sign-extension when increasing the size further */
if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
return false;
return true;
}
return false;
}
/* Combine an p_extract (or p_insert, in some cases) instruction with instr.
@ -1033,6 +1046,18 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
} else if (instr->isVOP3()) {
if (sel.offset())
instr->vop3().opsel |= 1 << idx;
} else if (instr->opcode == aco_opcode::p_extract) {
SubdwordSel instrSel = parse_extract(instr.get());
unsigned size = std::min(sel.size(), instrSel.size());
unsigned offset = sel.offset() + instrSel.offset();
unsigned sign_extend =
instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());
instr->operands[1] = Operand::c32(offset / size);
instr->operands[2] = Operand::c32(size * 8u);
instr->operands[3] = Operand::c32(sign_extend);
return;
}
/* output modifier and label_vopc seem to be the only one worth keeping at the moment */
@ -3406,8 +3431,17 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (instr->isSDWA() || instr->isDPP())
return;
if (instr->opcode == aco_opcode::p_extract)
if (instr->opcode == aco_opcode::p_extract) {
ssa_info& info = ctx.info[instr->operands[0].tempId()];
if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) {
apply_extract(ctx, instr, 0, info);
if (--ctx.uses[instr->operands[0].tempId()])
ctx.uses[info.instr->operands[0].tempId()]++;
instr->operands[0].setTemp(info.instr->operands[0].getTemp());
}
apply_ds_extract(ctx, instr);
}
/* TODO: There are still some peephole optimizations that could be done:
* - abs(a - b) -> s_absdiff_i32