nv50/ir: optimize imul/imad to xmads

This hurts the shader-db numbers a fair bit, but a few XMADs are much
faster than an IMUL or IMAD, and the cost is mitigated by the next commit,
which optimizes many multiplications by immediates into shorter and less
register-heavy instructions than the XMADs.

total instructions in shared programs : 5768871 -> 5820882 (0.90%)
total gprs used in shared programs    : 669919 -> 670595 (0.10%)
total shared used in shared programs  : 548832 -> 548832 (0.00%)
total local used in shared programs   : 21068 -> 21164 (0.46%)

                local     shared        gpr       inst      bytes
    helped           0           0          38           0           0
      hurt           1           0         365        3076        3076

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
This commit is contained in:
Rhys Perry 2018-06-13 16:25:23 +01:00
parent bcbcdf8448
commit b60bc7a4ab
2 changed files with 56 additions and 1 deletions

View file

@ -2301,13 +2301,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
// =============================================================================
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
// MUL(a, b) -> a few XMADs
// MAD/FMA(a, b, c) -> a few XMADs
// Late algebraic peephole pass. It rewrites individual instructions:
// OP_ADD goes through handleADD() and OP_MUL/OP_MAD/OP_FMA go through
// handleMULMAD(), which expands integer multiplies into XMAD sequences.
class LateAlgebraicOpt : public Pass
{
private:
// Per-instruction hook; dispatches on the instruction's opcode to the
// handlers below.
// NOTE(review): the meaning of the bool result is defined by Pass, which is
// not visible here - confirm before relying on it.
virtual bool visit(Instruction *);
// Handler invoked by visit() for OP_ADD instructions.
void handleADD(Instruction *);
// Handler invoked by visit() for OP_MUL, OP_MAD and OP_FMA; rewrites 32-bit
// integer multiplies / multiply-adds in place as three XMADs when the target
// supports OP_XMAD.
void handleMULMAD(Instruction *);
// ADD(SHL(a, b), c) -> SHLADD(a, b, c).
// Presumably returns true when the rewrite was applied - verify against the
// definition.
bool tryADDToSHLADD(Instruction *);
// Builder used by the handlers to set the insertion point and to create the
// immediates, SSA values and instructions they emit.
BuildUtil bld;
};
void
@ -2368,6 +2373,52 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
return true;
}
// Integer MUL(a, b)        -> three XMADs
// Integer MAD/FMA(a, b, c) -> three XMADs
//
// Two new XMADs are inserted in front of 'i', and 'i' itself is then turned
// into the third XMAD that consumes their results. Instructions that are
// float-typed, not 4 bytes wide, carry a subOp, or read/write flags are left
// untouched, as is everything when the target has no OP_XMAD.
void
LateAlgebraicOpt::handleMULMAD(Instruction *i)
{
   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32) ||
       isFloatType(i->dType) || typeSizeof(i->dType) != 4 ||
       i->subOp || i->usesFlags() || i->flagsDef >= 0)
      return;

   // Source modifiers are not carried over to the XMADs, so none may be set.
   assert(!i->src(0).mod);
   assert(!i->src(1).mod);
   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);

   // New instructions are emitted immediately before 'i'.
   bld.setPosition(i, false);

   Value *mulA = i->getSrc(0);
   Value *mulB = i->getSrc(1);

   // A plain MUL has no addend; use an immediate zero in its place.
   Value *addend;
   if (i->op == OP_MUL)
      addend = bld.mkImm(0);
   else
      addend = i->getSrc(2);

   Value *accResult = bld.getSSA();
   Value *mrgResult = bld.getSSA();

   // Both helper XMADs run under the same condition code and predicate as
   // the instruction they replace.
   Value *savedPred = i->getPredicate();

   // First XMAD: default sub-operation, takes the addend.
   Instruction *accXmad =
      bld.mkOp3(OP_XMAD, TYPE_U32, accResult, mulB, mulA, addend);
   accXmad->setPredicate(i->cc, savedPred);

   // Second XMAD: MRG with H1 selected on source 1, zero addend.
   Instruction *mrgXmad =
      bld.mkOp3(OP_XMAD, TYPE_U32, mrgResult, mulB, mulA, bld.mkImm(0));
   mrgXmad->setPredicate(i->cc, savedPred);
   mrgXmad->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);

   // The predicate is detached while the sources are rewritten and attached
   // again afterwards.
   // NOTE(review): presumably because the predicate occupies a source slot
   // that setSrc(2, ...) would otherwise collide with for a two-source MUL -
   // confirm against Instruction::setPredicate()/setSrc().
   i->setPredicate(i->cc, NULL);

   // Third XMAD: reuse 'i' so its destination and uses stay intact.
   i->op = OP_XMAD;
   i->setSrc(0, mulB);
   i->setSrc(1, mrgResult);
   i->setSrc(2, accResult);
   i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC |
              NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);

   i->setPredicate(i->cc, savedPred);
}
bool
LateAlgebraicOpt::visit(Instruction *i)
{
@ -2375,6 +2426,11 @@ LateAlgebraicOpt::visit(Instruction *i)
case OP_ADD:
handleADD(i);
break;
case OP_MUL:
case OP_MAD:
case OP_FMA:
handleMULMAD(i);
break;
default:
break;
}

View file

@ -170,7 +170,6 @@ TargetGM107::isBarrierRequired(const Instruction *insn) const
}
break;
case OPCLASS_ARITH:
// TODO: IMUL/IMAD require barriers too, use of XMAD instead!
if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
!isFloatType(insn->dType))
return true;