nv50/ir: fix s32 x s32 -> high s32 multiply logic

Retrieving the high 32 bits of a signed multiply is rather annoying. It
appears that the simplest way to do this is to compute the absolute
value of the arguments, and perform a u32 x u32 -> u64 operation. If the
arguments' signs differ, then negate the result. Since there is no u64
support in the cvt instruction, we have the perform the 2's complement
negation "by hand".

This logic can come into use by the IMUL_HI instruction (very unlikely
to be seen), as well as from constant folding of division by a constant.
Fixes dolphin's divisions by 255.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "10.1 10.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ben Skeggs <bskeggs@redhat.com>
(cherry picked from commit d3a5cf052c)
This commit is contained in:
Ilia Mirkin 2014-05-14 23:30:16 -04:00 committed by Carl Worth
parent a23e73e00d
commit 5d8e60dcc7
2 changed files with 82 additions and 11 deletions

View file

@ -37,18 +37,25 @@ namespace nv50_ir {
// ah*bl 00
//
// fffe0001 + fffe0001
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
DataType fTy = mul->sType; // full type
DataType hTy;
DataType fTy; // full type
switch (mul->sType) {
case TYPE_S32: fTy = TYPE_U32; break;
case TYPE_S64: fTy = TYPE_U64; break;
default: fTy = mul->sType; break;
}
DataType hTy; // half type
switch (fTy) {
case TYPE_S32: hTy = TYPE_S16; break;
case TYPE_U32: hTy = TYPE_U16; break;
case TYPE_U64: hTy = TYPE_U32; break;
case TYPE_S64: hTy = TYPE_S32; break;
default:
return false;
}
@ -59,15 +66,25 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
bld->setPosition(mul, true);
Value *s[2];
Value *a[2], *b[2];
Value *c[2];
Value *t[4];
for (int j = 0; j < 4; ++j)
t[j] = bld->getSSA(fullSize);
s[0] = mul->getSrc(0);
s[1] = mul->getSrc(1);
if (isSignedType(mul->sType)) {
s[0] = bld->getSSA(fullSize);
s[1] = bld->getSSA(fullSize);
bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
}
// split sources into halves
i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
i[0] = bld->mkSplit(a, halfSize, s[0]);
i[1] = bld->mkSplit(b, halfSize, s[1]);
i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
@ -75,24 +92,76 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
if (highResult) {
Value *r[4];
Value *c[2];
Value *r[5];
Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
c[0] = bld->getSSA(1, FILE_FLAGS);
c[1] = bld->getSSA(1, FILE_FLAGS);
for (int j = 0; j < 4; ++j)
for (int j = 0; j < 5; ++j)
r[j] = bld->getSSA(fullSize);
i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
// set carry defs / sources
i[3]->setFlagsDef(1, c[0]);
i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
// actual result required in negative case, but ignored for
// unsigned. for some reason the compiler ends up dropping the whole
// instruction if the destination is unused but the flags are.
if (isSignedType(mul->sType))
i[4]->setFlagsDef(1, c[1]);
else
i[4]->setFlagsDef(0, c[1]);
i[6]->setPredicate(CC_C, c[0]);
i[5]->setFlagsSrc(3, c[1]);
if (isSignedType(mul->sType)) {
Value *cc[2];
Value *rr[7];
Value *one = bld->getSSA(fullSize);
bld->loadImm(one, 1);
for (int j = 0; j < 7; j++)
rr[j] = bld->getSSA(fullSize);
// NOTE: this logic uses predicates because splitting basic blocks is
// ~impossible during the SSA phase. The RA relies on a correlation
// between edge order and phi node sources.
// Set the sign of the result based on the inputs
bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
// 1s complement of 64-bit value
bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
->setPredicate(CC_S, cc[0]);
bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
->setPredicate(CC_S, cc[0]);
// add to low 32-bits, keep track of the carry
Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
n->setPredicate(CC_S, cc[0]);
n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
// If there was a carry, add 1 to the upper 32 bits
// XXX: These get executed even if they shouldn't be
bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
->setPredicate(CC_C, cc[1]);
bld->mkMov(rr[3], rr[0])
->setPredicate(CC_NC, cc[1]);
bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
// Merge the results from the negative and non-negative paths
bld->mkMov(rr[5], rr[4])
->setPredicate(CC_S, cc[0]);
bld->mkMov(rr[6], r[4])
->setPredicate(CC_NS, cc[0]);
bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
} else {
bld->mkMov(mul->getDef(0), r[4]);
}
} else {
bld->mkMov(mul->getDef(0), t[3]);
}

View file

@ -329,6 +329,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
return false;
if (sf == FILE_IMMEDIATE)
return false;
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH && sf == FILE_MEMORY_CONST)
return false;
ldSize = 2;
} else {
ldSize = typeSizeof(ld->dType);