mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-07 03:40:27 +01:00
aco/optimizer: rework how dpp is applied
Using the common helpers means we can use VINTERP, which has higher throughput and smaller CodeSize, instead of DPP.
Foz-DB Navi48:
Totals from 986 (1.20% of 82405) affected shaders:
Instrs: 1985282 -> 1985545 (+0.01%); split: -0.01%, +0.02%
CodeSize: 11179700 -> 11151780 (-0.25%); split: -0.26%, +0.01%
Latency: 19899190 -> 19897694 (-0.01%); split: -0.01%, +0.01%
InvThroughput: 4110650 -> 4104911 (-0.14%)
VClause: 44143 -> 44139 (-0.01%); split: -0.03%, +0.02%
Copies: 164340 -> 164344 (+0.00%); split: -0.02%, +0.02%
VALU: 1061904 -> 1061908 (+0.00%); split: -0.00%, +0.00%
SALU: 305980 -> 305974 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39516>
This commit is contained in:
parent
228cb29dae
commit
bb6a3e2891
1 changed file with 59 additions and 57 deletions
|
|
@@ -4959,79 +4959,81 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
}
|
||||
|
||||
/* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
|
||||
if (instr->isVALU() && !instr->isDPP()) {
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||
if (!instr->operands[i].isTemp())
|
||||
continue;
|
||||
ssa_info info = ctx.info[instr->operands[i].tempId()];
|
||||
if (instr->isVALU() && std::any_of(instr->operands.begin(), instr->operands.end(),
|
||||
[&](const Operand& op)
|
||||
{
|
||||
if (!op.isTemp())
|
||||
return false;
|
||||
Instruction* parent = ctx.info[op.tempId()].parent_instr;
|
||||
return parent->isDPP() &&
|
||||
parent->opcode == aco_opcode::v_mov_b32 &&
|
||||
parent->pass_flags == instr->pass_flags;
|
||||
})) {
|
||||
|
||||
if (!info.parent_instr->isDPP() || info.parent_instr->opcode != aco_opcode::v_mov_b32 ||
|
||||
info.parent_instr->pass_flags != instr->pass_flags)
|
||||
alu_opt_info input_info;
|
||||
if (!alu_opt_gather_info(ctx, instr.get(), input_info))
|
||||
return;
|
||||
|
||||
alu_opt_info dpp_info;
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < input_info.operands.size(); i++) {
|
||||
if (!input_info.operands[i].op.isTemp())
|
||||
continue;
|
||||
Instruction* parent = ctx.info[input_info.operands[i].op.tempId()].parent_instr;
|
||||
|
||||
if (!parent->isDPP() || parent->opcode != aco_opcode::v_mov_b32 ||
|
||||
parent->pass_flags != instr->pass_flags)
|
||||
continue;
|
||||
|
||||
/* We won't eliminate the DPP mov if the operand is used twice */
|
||||
bool op_used_twice = false;
|
||||
for (unsigned j = 0; j < instr->operands.size(); j++)
|
||||
op_used_twice |= i != j && instr->operands[i] == instr->operands[j];
|
||||
for (unsigned j = 0; j < input_info.operands.size(); j++)
|
||||
op_used_twice |= i != j && input_info.operands[i].op == input_info.operands[j].op;
|
||||
if (op_used_twice)
|
||||
continue;
|
||||
|
||||
bool dpp8 = info.parent_instr->isDPP8();
|
||||
bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i) &&
|
||||
get_operand_type(instr, i).bit_size == 32;
|
||||
bool mov_uses_mods = info.parent_instr->valu().neg[0] || info.parent_instr->valu().abs[0];
|
||||
if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
|
||||
if (input_info.operands[i].dpp16 || input_info.operands[i].dpp8)
|
||||
continue;
|
||||
|
||||
Format old_format = instr->format;
|
||||
if (i != 0) {
|
||||
if (!instr->operands[0].isOfType(RegType::vgpr) && !instr->isVOP3P())
|
||||
instr->format = asVOP3(instr->format);
|
||||
if (!can_swap_operands(instr, &instr->opcode, 0, i)) {
|
||||
instr->format = old_format;
|
||||
continue;
|
||||
}
|
||||
instr->valu().swapOperands(0, i);
|
||||
}
|
||||
alu_opt_op outer;
|
||||
outer.op = parent->operands[0];
|
||||
outer.neg[0] = parent->valu().neg[0];
|
||||
outer.abs[0] = parent->valu().abs[0];
|
||||
aco_type outer_type = {aco_base_type_uint, 1, 32};
|
||||
|
||||
if (!can_use_DPP(ctx.program->gfx_level, instr, dpp8)) {
|
||||
if (i != 0) {
|
||||
ASSERTED bool success = can_swap_operands(instr, &instr->opcode, 0, i);
|
||||
assert(success);
|
||||
instr->valu().swapOperands(0, i);
|
||||
instr->format = old_format;
|
||||
}
|
||||
alu_opt_op inner = input_info.operands[i];
|
||||
aco_type inner_type = get_canonical_operand_type(input_info.opcode, i);
|
||||
if (inner.f16_to_f32)
|
||||
inner_type.bit_size = 16;
|
||||
if (!combine_operand(ctx, inner, inner_type, outer, outer_type, false))
|
||||
continue;
|
||||
|
||||
if (parent->isDPP16()) {
|
||||
inner.dpp16 = true;
|
||||
inner.dpp_ctrl = parent->dpp16().dpp_ctrl;
|
||||
inner.fi = parent->dpp16().fetch_inactive;
|
||||
inner.bc = parent->dpp16().bound_ctrl;
|
||||
assert(parent->dpp16().row_mask == 0xf && parent->dpp16().bank_mask == 0xf);
|
||||
} else if (parent->isDPP8()) {
|
||||
inner.dpp8 = true;
|
||||
inner.dpp_ctrl = parent->dpp8().lane_sel;
|
||||
inner.fi = parent->dpp8().fetch_inactive;
|
||||
}
|
||||
|
||||
convert_to_DPP(ctx.program->gfx_level, instr, dpp8);
|
||||
alu_opt_info candidate = input_info;
|
||||
candidate.operands[i] = inner;
|
||||
if (!alu_opt_info_is_valid(ctx, candidate))
|
||||
continue;
|
||||
|
||||
if (dpp8) {
|
||||
DPP8_instruction* dpp = &instr->dpp8();
|
||||
dpp->lane_sel = info.parent_instr->dpp8().lane_sel;
|
||||
dpp->fetch_inactive = info.parent_instr->dpp8().fetch_inactive;
|
||||
if (mov_uses_mods && !instr->isVOP3P())
|
||||
instr->format = asVOP3(instr->format);
|
||||
} else {
|
||||
DPP16_instruction* dpp = &instr->dpp16();
|
||||
/* anything else doesn't make sense in SSA */
|
||||
assert(info.parent_instr->dpp16().row_mask == 0xf &&
|
||||
info.parent_instr->dpp16().bank_mask == 0xf);
|
||||
dpp->dpp_ctrl = info.parent_instr->dpp16().dpp_ctrl;
|
||||
dpp->bound_ctrl = info.parent_instr->dpp16().bound_ctrl;
|
||||
dpp->fetch_inactive = info.parent_instr->dpp16().fetch_inactive;
|
||||
}
|
||||
|
||||
instr->valu().neg[0] ^= info.parent_instr->valu().neg[0] && !instr->valu().abs[0];
|
||||
instr->valu().abs[0] |= info.parent_instr->valu().abs[0];
|
||||
|
||||
if (--ctx.uses[info.parent_instr->definitions[0].tempId()])
|
||||
ctx.uses[info.parent_instr->operands[0].tempId()]++;
|
||||
instr->operands[0].setTemp(info.parent_instr->operands[0].getTemp());
|
||||
for (const Definition& def : instr->definitions)
|
||||
ctx.info[def.tempId()].parent_instr = instr.get();
|
||||
break;
|
||||
if (--ctx.uses[parent->definitions[0].tempId()])
|
||||
ctx.uses[parent->operands[0].tempId()]++;
|
||||
input_info.operands[i] = inner;
|
||||
dpp_info = candidate;
|
||||
progress = true;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
instr.reset(alu_opt_info_to_instr(ctx, dpp_info, instr.release()));
|
||||
}
|
||||
|
||||
/* Use v_fma_mix for f2f32/f2f16 if it has higher throughput.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue