nvc0/ir: add support for all the new int64 tgsi opcodes

A few thoughts:
 - Some of that LegalizeSSA logic should really live much earlier and be
   subject to the likes of DCE and other useful passes
 - Some of the "lowering" done in from_tgsi should be done later so that
   proper optimization might be done.

However this all works and the above can be improved upon later.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
This commit is contained in:
Ilia Mirkin 2017-02-04 22:31:04 -05:00
parent 009c54aa7a
commit 1aefd6159c
6 changed files with 302 additions and 5 deletions

View file

@ -354,6 +354,14 @@ unsigned int Instruction::srcMask(unsigned int s) const
case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_DSEQ:
case TGSI_OPCODE_DSNE:
case TGSI_OPCODE_U64SEQ:
case TGSI_OPCODE_U64SNE:
case TGSI_OPCODE_I64SLT:
case TGSI_OPCODE_U64SLT:
case TGSI_OPCODE_I64SGE:
case TGSI_OPCODE_U64SGE:
case TGSI_OPCODE_I642F:
case TGSI_OPCODE_U642F:
switch (util_bitcount(mask)) {
case 1: return 0x3;
case 2: return 0xf;
@ -557,6 +565,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
case TGSI_OPCODE_SHL:
case TGSI_OPCODE_U2F:
case TGSI_OPCODE_U2D:
case TGSI_OPCODE_U2I64:
case TGSI_OPCODE_UADD:
case TGSI_OPCODE_UDIV:
case TGSI_OPCODE_UMOD:
@ -587,6 +596,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
return nv50_ir::TYPE_U32;
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_I2D:
case TGSI_OPCODE_I2I64:
case TGSI_OPCODE_IDIV:
case TGSI_OPCODE_IMUL_HI:
case TGSI_OPCODE_IMAX:
@ -608,6 +618,8 @@ nv50_ir::DataType Instruction::inferSrcType() const
case TGSI_OPCODE_D2F:
case TGSI_OPCODE_D2I:
case TGSI_OPCODE_D2U:
case TGSI_OPCODE_D2I64:
case TGSI_OPCODE_D2U64:
case TGSI_OPCODE_DABS:
case TGSI_OPCODE_DNEG:
case TGSI_OPCODE_DADD:
@ -630,6 +642,34 @@ nv50_ir::DataType Instruction::inferSrcType() const
case TGSI_OPCODE_DFLR:
case TGSI_OPCODE_DROUND:
return nv50_ir::TYPE_F64;
case TGSI_OPCODE_U64SEQ:
case TGSI_OPCODE_U64SNE:
case TGSI_OPCODE_U64SLT:
case TGSI_OPCODE_U64SGE:
case TGSI_OPCODE_U64MIN:
case TGSI_OPCODE_U64MAX:
case TGSI_OPCODE_U64ADD:
case TGSI_OPCODE_U64MUL:
case TGSI_OPCODE_U64SHL:
case TGSI_OPCODE_U64SHR:
case TGSI_OPCODE_U64DIV:
case TGSI_OPCODE_U64MOD:
case TGSI_OPCODE_U642F:
case TGSI_OPCODE_U642D:
return nv50_ir::TYPE_U64;
case TGSI_OPCODE_I64ABS:
case TGSI_OPCODE_I64SSG:
case TGSI_OPCODE_I64NEG:
case TGSI_OPCODE_I64SLT:
case TGSI_OPCODE_I64SGE:
case TGSI_OPCODE_I64MIN:
case TGSI_OPCODE_I64MAX:
case TGSI_OPCODE_I64SHR:
case TGSI_OPCODE_I64DIV:
case TGSI_OPCODE_I64MOD:
case TGSI_OPCODE_I642F:
case TGSI_OPCODE_I642D:
return nv50_ir::TYPE_S64;
default:
return nv50_ir::TYPE_F32;
}
@ -650,17 +690,35 @@ nv50_ir::DataType Instruction::inferDstType() const
case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_DSLT:
case TGSI_OPCODE_DSNE:
case TGSI_OPCODE_I64SLT:
case TGSI_OPCODE_I64SGE:
case TGSI_OPCODE_U64SEQ:
case TGSI_OPCODE_U64SNE:
case TGSI_OPCODE_U64SLT:
case TGSI_OPCODE_U64SGE:
case TGSI_OPCODE_PK2H:
return nv50_ir::TYPE_U32;
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_U2F:
case TGSI_OPCODE_D2F:
case TGSI_OPCODE_I642F:
case TGSI_OPCODE_U642F:
case TGSI_OPCODE_UP2H:
return nv50_ir::TYPE_F32;
case TGSI_OPCODE_I2D:
case TGSI_OPCODE_U2D:
case TGSI_OPCODE_F2D:
case TGSI_OPCODE_I642D:
case TGSI_OPCODE_U642D:
return nv50_ir::TYPE_F64;
case TGSI_OPCODE_I2I64:
case TGSI_OPCODE_U2I64:
case TGSI_OPCODE_F2I64:
case TGSI_OPCODE_D2I64:
return nv50_ir::TYPE_S64;
case TGSI_OPCODE_F2U64:
case TGSI_OPCODE_D2U64:
return nv50_ir::TYPE_U64;
default:
return inferSrcType();
}
@ -676,6 +734,8 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_USLT:
case TGSI_OPCODE_FSLT:
case TGSI_OPCODE_DSLT:
case TGSI_OPCODE_I64SLT:
case TGSI_OPCODE_U64SLT:
return CC_LT;
case TGSI_OPCODE_SLE:
return CC_LE;
@ -684,6 +744,8 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_USGE:
case TGSI_OPCODE_FSGE:
case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_I64SGE:
case TGSI_OPCODE_U64SGE:
return CC_GE;
case TGSI_OPCODE_SGT:
return CC_GT;
@ -691,10 +753,12 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_USEQ:
case TGSI_OPCODE_FSEQ:
case TGSI_OPCODE_DSEQ:
case TGSI_OPCODE_U64SEQ:
return CC_EQ;
case TGSI_OPCODE_SNE:
case TGSI_OPCODE_FSNE:
case TGSI_OPCODE_DSNE:
case TGSI_OPCODE_U64SNE:
return CC_NEU;
case TGSI_OPCODE_USNE:
return CC_NE;
@ -832,6 +896,35 @@ static nv50_ir::operation translateOpcode(uint opcode)
NV50_IR_OPCODE_CASE(DFLR, FLOOR);
NV50_IR_OPCODE_CASE(DROUND, CVT);
NV50_IR_OPCODE_CASE(U64SEQ, SET);
NV50_IR_OPCODE_CASE(U64SNE, SET);
NV50_IR_OPCODE_CASE(U64SLT, SET);
NV50_IR_OPCODE_CASE(U64SGE, SET);
NV50_IR_OPCODE_CASE(I64SLT, SET);
NV50_IR_OPCODE_CASE(I64SGE, SET);
NV50_IR_OPCODE_CASE(I2I64, CVT);
NV50_IR_OPCODE_CASE(U2I64, CVT);
NV50_IR_OPCODE_CASE(F2I64, CVT);
NV50_IR_OPCODE_CASE(F2U64, CVT);
NV50_IR_OPCODE_CASE(D2I64, CVT);
NV50_IR_OPCODE_CASE(D2U64, CVT);
NV50_IR_OPCODE_CASE(I642F, CVT);
NV50_IR_OPCODE_CASE(U642F, CVT);
NV50_IR_OPCODE_CASE(I642D, CVT);
NV50_IR_OPCODE_CASE(U642D, CVT);
NV50_IR_OPCODE_CASE(I64MIN, MIN);
NV50_IR_OPCODE_CASE(U64MIN, MIN);
NV50_IR_OPCODE_CASE(I64MAX, MAX);
NV50_IR_OPCODE_CASE(U64MAX, MAX);
NV50_IR_OPCODE_CASE(I64ABS, ABS);
NV50_IR_OPCODE_CASE(I64NEG, NEG);
NV50_IR_OPCODE_CASE(U64ADD, ADD);
NV50_IR_OPCODE_CASE(U64MUL, MUL);
NV50_IR_OPCODE_CASE(U64SHL, SHL);
NV50_IR_OPCODE_CASE(I64SHR, SHR);
NV50_IR_OPCODE_CASE(U64SHR, SHR);
NV50_IR_OPCODE_CASE(IMUL_HI, MUL);
NV50_IR_OPCODE_CASE(UMUL_HI, MUL);
@ -3721,6 +3814,8 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
case TGSI_OPCODE_INTERP_OFFSET:
handleINTERP(dst0);
break;
case TGSI_OPCODE_I642F:
case TGSI_OPCODE_U642F:
case TGSI_OPCODE_D2I:
case TGSI_OPCODE_D2U:
case TGSI_OPCODE_D2F: {
@ -3737,16 +3832,79 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
}
break;
}
case TGSI_OPCODE_I2I64:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
dst0[c] = fetchSrc(0, c / 2);
mkOp2(OP_SHR, TYPE_S32, dst0[c + 1], dst0[c], loadImm(NULL, 31));
c++;
}
break;
case TGSI_OPCODE_U2I64:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
dst0[c] = fetchSrc(0, c / 2);
dst0[c + 1] = zero;
c++;
}
break;
case TGSI_OPCODE_F2I64:
case TGSI_OPCODE_F2U64:
case TGSI_OPCODE_I2D:
case TGSI_OPCODE_U2D:
case TGSI_OPCODE_F2D:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
Value *dreg = getSSA(8);
mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2));
Instruction *cvt = mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2));
if (!isFloatType(dstTy))
cvt->rnd = ROUND_Z;
mkSplit(&dst0[c], 4, dreg);
c++;
}
break;
case TGSI_OPCODE_D2I64:
case TGSI_OPCODE_D2U64:
case TGSI_OPCODE_I642D:
case TGSI_OPCODE_U642D:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
src0 = getSSA(8);
Value *dst = getSSA(8), *tmp[2];
tmp[0] = fetchSrc(0, c);
tmp[1] = fetchSrc(0, c + 1);
mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
Instruction *cvt = mkCvt(OP_CVT, dstTy, dst, srcTy, src0);
if (!isFloatType(dstTy))
cvt->rnd = ROUND_Z;
mkSplit(&dst0[c], 4, dst);
c++;
}
break;
case TGSI_OPCODE_I64NEG:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
src0 = getSSA(8);
Value *dst = getSSA(8), *tmp[2];
tmp[0] = fetchSrc(0, c);
tmp[1] = fetchSrc(0, c + 1);
mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
mkOp2(OP_SUB, dstTy, dst, zero, src0);
mkSplit(&dst0[c], 4, dst);
c++;
}
break;
case TGSI_OPCODE_I64ABS:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
src0 = getSSA(8);
Value *neg = getSSA(8), *srcComp[2], *negComp[2];
srcComp[0] = fetchSrc(0, c);
srcComp[1] = fetchSrc(0, c + 1);
mkOp2(OP_MERGE, TYPE_U64, src0, srcComp[0], srcComp[1]);
mkOp2(OP_SUB, dstTy, neg, zero, src0);
mkSplit(negComp, 4, neg);
mkCmp(OP_SLCT, CC_LT, TYPE_S32, dst0[c], TYPE_S32,
negComp[0], srcComp[0], srcComp[1]);
mkCmp(OP_SLCT, CC_LT, TYPE_S32, dst0[c + 1], TYPE_S32,
negComp[1], srcComp[1], srcComp[1]);
c++;
}
break;
case TGSI_OPCODE_DABS:
case TGSI_OPCODE_DNEG:
case TGSI_OPCODE_DRCP:
@ -3779,6 +3937,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
c++;
}
break;
case TGSI_OPCODE_U64SEQ:
case TGSI_OPCODE_U64SNE:
case TGSI_OPCODE_U64SLT:
case TGSI_OPCODE_U64SGE:
case TGSI_OPCODE_I64SLT:
case TGSI_OPCODE_I64SGE:
case TGSI_OPCODE_DSLT:
case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_DSEQ:
@ -3800,6 +3964,46 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
}
break;
}
case TGSI_OPCODE_U64MIN:
case TGSI_OPCODE_U64MAX:
case TGSI_OPCODE_I64MIN:
case TGSI_OPCODE_I64MAX: {
dstTy = isSignedIntType(dstTy) ? TYPE_S32 : TYPE_U32;
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
Value *flag = getSSA(1, FILE_FLAGS);
src0 = fetchSrc(0, c + 1);
src1 = fetchSrc(1, c + 1);
geni = mkOp2(op, dstTy, dst0[c + 1], src0, src1);
geni->subOp = NV50_IR_SUBOP_MINMAX_HIGH;
geni->setFlagsDef(1, flag);
src0 = fetchSrc(0, c);
src1 = fetchSrc(1, c);
geni = mkOp2(op, TYPE_U32, dst0[c], src0, src1);
geni->subOp = NV50_IR_SUBOP_MINMAX_LOW;
geni->setFlagsSrc(2, flag);
c++;
}
break;
}
case TGSI_OPCODE_U64SHL:
case TGSI_OPCODE_I64SHR:
case TGSI_OPCODE_U64SHR:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
src0 = getSSA(8);
Value *dst = getSSA(8), *tmp[2];
tmp[0] = fetchSrc(0, c);
tmp[1] = fetchSrc(0, c + 1);
mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
src1 = fetchSrc(1, c / 2);
mkOp2(op, dstTy, dst, src0, src1);
mkSplit(&dst0[c], 4, dst);
c++;
}
break;
case TGSI_OPCODE_U64ADD:
case TGSI_OPCODE_U64MUL:
case TGSI_OPCODE_DADD:
case TGSI_OPCODE_DMUL:
case TGSI_OPCODE_DDIV:
@ -3873,6 +4077,22 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
mkSplit(&dst0[c], 4, dst);
c++;
}
// Terminate the preceding case arm: without this break its channel loop
// falls straight through into the I64SSG lowering below, which would emit a
// second, unrelated set of writes to the same destination channels.
break;
case TGSI_OPCODE_I64SSG:
   // Sign of a 64-bit signed integer, produced as a pair of 32-bit channels
   // (dst0[c] and dst0[c + 1]) for every enabled destination channel pair.
   FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
      // Rebuild the 64-bit operand from its two 32-bit source channels.
      src0 = getSSA(8);
      Value *tmp[2];
      tmp[0] = fetchSrc(0, c);
      tmp[1] = fetchSrc(0, c + 1);
      mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);

      // val0 <- (src0 > 0), val1 <- (src0 < 0), both compared as S64.
      val0 = getScratch();
      val1 = getScratch();
      mkCmp(OP_SET, CC_GT, TYPE_U32, val0, TYPE_S64, src0, zero);
      mkCmp(OP_SET, CC_LT, TYPE_U32, val1, TYPE_S64, src0, zero);

      // First channel: val1 - val0.
      // NOTE(review): this yields -1/0/+1 assuming an integer SET produces
      // all-ones for "true" -- confirm against the OP_SET semantics.
      mkOp2(OP_SUB, TYPE_S32, dst0[c], val1, val0);
      // Second channel: signed shift by 31 replicates the sign bit of the
      // first channel.
      mkOp2(OP_SHR, TYPE_S32, dst0[c + 1], dst0[c], loadImm(0, 31));

      // Two channels were consumed by this 64-bit component.
      c++;
   }
   break;
default:
ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode());

View file

@ -147,6 +147,59 @@ NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
i->moveSources(arg + 1, -1);
}
// Rewrites the 64-bit shift `lo` as two 32-bit instructions of the same
// opcode, one per result half, and merges the two 32-bit results back into
// the value that the original instruction defined.
//
// `lo` is modified in place; a second instruction is created and inserted
// directly after it in the same basic block.
void
NVC0LegalizeSSA::handleShift(Instruction *lo)
{
// Companion instruction: same opcode as the original, 32-bit type.
Instruction *hi = new_Instruction(func, lo->op, TYPE_U32);
lo->bb->insertAfter(lo, hi);
// The operand split below is built at `lo`.
// NOTE(review): `false` presumably means "insert before" -- confirm in BuildUtil.
bld.setPosition(lo, false);
// dst[0] / dst[1] are fresh values that receive the two 32-bit results.
Value *src[2], *dst[2] = {bld.getSSA(), bld.getSSA()};
// Capture the original destination and shift amount before they are replaced.
Value *dst64 = lo->getDef(0), *shift = lo->getSrc(1);
// Break the 64-bit operand into two 4-byte values, src[0] and src[1].
bld.mkSplit(src, 4, lo->getSrc(0));
// The new instruction keeps the original source type; the original one now
// produces a 32-bit result.
hi->sType = lo->sType;
lo->dType = TYPE_U32;
hi->setDef(0, dst[1]);
// Only right shifts tag the second instruction with the SHIFT_HIGH sub-op.
if (lo->op == OP_SHR)
hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
lo->setDef(0, dst[0]);
// The final merge is emitted relative to the second instruction.
// NOTE(review): `true` presumably means "insert after" -- confirm in BuildUtil.
bld.setPosition(hi, true);
// For left shifts the two pointers are exchanged, so from here on `hi` names
// the original instruction and `lo` the newly created one. Both share the
// same opcode, which is why the OP_SHL test further down still holds after
// the swap. The defs assigned above are not affected by the exchange.
if (lo->op == OP_SHL)
std::swap(hi, lo);
// `hi` takes an immediate zero, the shift amount, and one half of the split
// operand (src[0] for SHL, src[1] otherwise).
hi->setSrc(0, new_ImmediateValue(prog, 0u));
hi->setSrc(1, shift);
hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);
// `lo` takes both halves of the split operand plus the shift amount.
lo->setSrc(0, src[0]);
lo->setSrc(1, shift);
lo->setSrc(2, src[1]);
// Reassemble the two 32-bit results into the original 64-bit value.
bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
}
// Lowers a comparison on 64-bit integer operands to a 32-bit comparison.
//
// Both operands are split into two 4-byte halves. A subtraction of the [0]
// halves is emitted purely for its condition flags; those flags are appended
// to `cmp` as an extra source, and `cmp` itself is rewritten to compare the
// [1] halves using a 32-bit source type.
//
// cmp: the comparison instruction to rewrite in place. The caller is
//      expected to have checked that its source type is a 64-bit integer.
void
NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
{
   // Signed 64-bit compares continue as signed 32-bit, everything else as
   // unsigned 32-bit.
   DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
   Value *carry;
   Value *src0[2], *src1[2];

   bld.setPosition(cmp, false);

   bld.mkSplit(src0, 4, cmp->getSrc(0));
   bld.mkSplit(src1, 4, cmp->getSrc(1));

   // The subtraction has no ordinary result (NULL destination), so the flags
   // value has to occupy def slot 0. Placing it in slot 1 would leave an
   // empty slot 0 in front of it, and code that walks the defs from index 0
   // would then never reach the flags def.
   bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
      ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));

   // Append the flags after the existing sources, then narrow the compare to
   // the remaining halves.
   cmp->setFlagsSrc(cmp->srcCount(), carry);
   cmp->setSrc(0, src0[1]);
   cmp->setSrc(1, src1[1]);
   cmp->sType = hTy;
}
bool
NVC0LegalizeSSA::visit(Function *fn)
{
@ -179,6 +232,18 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
case OP_TXF:
handleTEXLOD(i->asTex());
break;
case OP_SHR:
case OP_SHL:
if (typeSizeof(i->sType) == 8)
handleShift(i);
break;
case OP_SET:
case OP_SET_AND:
case OP_SET_OR:
case OP_SET_XOR:
if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
handleSET(i->asCmp());
break;
default:
break;
}
@ -612,7 +677,7 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
} else {
// TODO: Move this to before register allocation for operations that
// need the $c register !
if (typeSizeof(i->dType) == 8) {
if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
Instruction *hi;
hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
if (hi)

View file

@ -35,7 +35,9 @@ private:
void handleDIV(Instruction *); // integer division, modulus
void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
void handleFTZ(Instruction *);
void handleSET(CmpInstruction *);
void handleTEXLOD(TexInstruction *);
void handleShift(Instruction *);
protected:
BuildUtil bld;

View file

@ -1054,8 +1054,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
}
break;
case OP_ADD:
case OP_SUB:
if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
!isFloatType(i->dType))
break;
/* fallthrough */
case OP_ADD:
if (i->usesFlags())
break;
if (imm0.isInteger(0)) {

View file

@ -392,7 +392,8 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) {
emit->emitInstruction(i);
info->bin.instructions++;
if (i->sType == TYPE_F64 || i->dType == TYPE_F64)
if ((typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) &&
(isFloatType(i->sType) || isFloatType(i->dType)))
info->io.fp64 = true;
}
}

View file

@ -329,6 +329,10 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
// indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
if (ld->src(0).isIndirect(0))
return false;
// these are implemented using shf.r and shf.l which can't load consts
if ((i->op == OP_SHL || i->op == OP_SHR) && typeSizeof(i->sType) == 8 &&
sf == FILE_MEMORY_CONST)
return false;
for (int k = 0; i->srcExists(k); ++k) {
if (i->src(k).getFile() == FILE_IMMEDIATE) {
@ -340,7 +344,8 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
return false;
} else
if (i->src(k).getFile() != FILE_GPR &&
i->src(k).getFile() != FILE_PREDICATE) {
i->src(k).getFile() != FILE_PREDICATE &&
i->src(k).getFile() != FILE_FLAGS) {
return false;
}
}