nvc0/ir: initial implementation of nve4 scheduling hints

Christoph Bumiller 2012-04-29 17:59:06 +02:00
parent 00fe442253
commit afcd7b5d16
8 changed files with 738 additions and 15 deletions

@@ -681,6 +681,8 @@ public:
uint8_t subOp; // quadop, 1 for mul-high, etc.
uint8_t sched; // scheduling data (NOTE: maybe move to separate storage)
unsigned encSize : 4; // encoding size in bytes
unsigned saturate : 1; // to [0.0f, 1.0f]
unsigned join : 1; // converge control flow (use OP_JOIN until end)
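(For quick reference, and not part of the diff: the encodings that the scheduler added below assigns to this byte can be summarized by a small decoder that mirrors SchedDataCalculator::setDelay()/getCycles() from this commit. The helper name is illustrative only.)

#include <cstdint>

// sketch: turn a sched byte back into issue cycles, per getCycles() further down
static inline int schedToIssueCycles(uint8_t sched)
{
   if (sched & 0x80)                 // 0xc0 | n: barrier-style stall (e.g. 0xc2 for TEXBAR)
      return (sched & 0x0f) * 2 + 1;
   if (sched & 0x60)                 // 0x20 | delay, or 0x40 | delay after an EXPORT
      return (sched & 0x1f) + 1;
   return (sched == 0x04) ? 0 : 32;  // 0x04 marks dual-issue with the next instruction
}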

@@ -53,6 +53,26 @@ static const char *colour[8] =
#endif
};
static const char *OpClassStr[OPCLASS_OTHER + 1] =
{
"MOVE",
"LOAD",
"STORE",
"ARITH",
"SHIFT",
"SFU",
"LOGIC",
"COMPARE",
"CONVERT",
"ATOMIC",
"TEXTURE",
"SURFACE",
"FLOW",
"(INVALID)",
"PSEUDO",
"OTHER"
};
const char *operationStr[OP_LAST + 1] =
{
"nop",

@@ -52,6 +52,65 @@ const uint8_t Target::operationSrcNr[OP_LAST + 1] =
0
};
const OpClass Target::operationClass[OP_LAST + 1] =
{
// NOP; PHI; UNION, SPLIT, MERGE, CONSTRAINT
OPCLASS_OTHER,
OPCLASS_PSEUDO,
OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO,
// MOV; LOAD; STORE
OPCLASS_MOVE,
OPCLASS_LOAD,
OPCLASS_STORE,
// ADD, SUB, MUL; DIV, MOD; MAD, FMA, SAD
OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
OPCLASS_ARITH, OPCLASS_ARITH,
OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
// ABS, NEG; NOT, AND, OR, XOR; SHL, SHR
OPCLASS_CONVERT, OPCLASS_CONVERT,
OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC,
OPCLASS_SHIFT, OPCLASS_SHIFT,
// MAX, MIN
OPCLASS_COMPARE, OPCLASS_COMPARE,
// SAT, CEIL, FLOOR, TRUNC; CVT
OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT,
OPCLASS_CONVERT,
// SET(AND,OR,XOR); SELP, SLCT
OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE,
OPCLASS_COMPARE, OPCLASS_COMPARE,
// RCP, RSQ, LG2, SIN, COS; EX2, EXP, LOG, PRESIN, PREEX2; SQRT, POW
OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
OPCLASS_SFU, OPCLASS_SFU,
// BRA, CALL, RET; CONT, BREAK, PRE(RET,CONT,BREAK); BRKPT, JOINAT, JOIN
OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
// DISCARD, EXIT
OPCLASS_FLOW, OPCLASS_FLOW,
// MEMBAR
OPCLASS_OTHER,
// VFETCH, PFETCH, EXPORT
OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE,
// LINTERP, PINTERP
OPCLASS_SFU, OPCLASS_SFU,
// EMIT, RESTART
OPCLASS_OTHER, OPCLASS_OTHER,
// TEX, TXB, TXL, TXF; TXQ, TXD, TXG, TEXCSAA
OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
// SULD, SUST
OPCLASS_SURFACE, OPCLASS_SURFACE,
// DFDX, DFDY, RDSV, WRSV; PIXLD, QUADOP, QUADON, QUADPOP
OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
// POPCNT, INSBF, EXTBF
OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
// TEXBAR
OPCLASS_OTHER,
OPCLASS_PSEUDO // LAST
};
extern Target *getTargetNVC0(unsigned int chipset);
extern Target *getTargetNV50(unsigned int chipset);
@@ -104,6 +163,11 @@ CodeEmitter::printBinary() const
INFO("\n");
}
static inline uint32_t sizeToBundlesNVE4(uint32_t size)
{
return (size + 55) / 56;
}
void
CodeEmitter::prepareEmission(Program *prog)
{
@@ -112,6 +176,23 @@ CodeEmitter::prepareEmission(Program *prog)
Function *func = reinterpret_cast<Function *>(fi.get());
func->binPos = prog->binSize;
prepareEmission(func);
// adjust sizes & positions for scheduling info:
if (prog->getTarget()->hasSWSched) {
BasicBlock *bb = NULL;
for (int i = 0; i < func->bbCount; ++i) {
bb = func->bbArray[i];
const uint32_t oldPos = bb->binPos;
const uint32_t oldEnd = bb->binPos + bb->binSize;
uint32_t adjPos = oldPos + sizeToBundlesNVE4(oldPos) * 8;
uint32_t adjEnd = oldEnd + sizeToBundlesNVE4(oldEnd) * 8;
bb->binPos = adjPos;
bb->binSize = adjEnd - adjPos;
}
if (bb)
func->binSize = bb->binPos + bb->binSize;
}
prog->binSize += func->binSize;
}
}
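(Illustrative, not part of the commit: the adjustment above assumes one 8-byte control word per group of seven 8-byte instructions, i.e. per 56 bytes of raw code, which the emitter below inserts at every 64-byte boundary. A minimal self-contained check of that arithmetic:)

#include <cassert>
#include <cstdint>

static uint32_t sizeToBundlesNVE4(uint32_t size) { return (size + 55) / 56; }

int main()
{
   assert(sizeToBundlesNVE4(56) == 1);   // 7 instructions -> 1 control word
   assert(sizeToBundlesNVE4(57) == 2);   // starting an 8th instruction opens a new group
   // a block that started at raw byte 112 is preceded by 2 control words,
   // so its adjusted position is 112 + 2 * 8 = 128
   const uint32_t oldPos = 112;
   assert(oldPos + sizeToBundlesNVE4(oldPos) * 8 == 128);
   return 0;
}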

@@ -93,9 +93,31 @@ protected:
RelocInfo *relocInfo;
};
enum OpClass
{
OPCLASS_MOVE = 0,
OPCLASS_LOAD = 1,
OPCLASS_STORE = 2,
OPCLASS_ARITH = 3,
OPCLASS_SHIFT = 4,
OPCLASS_SFU = 5,
OPCLASS_LOGIC = 6,
OPCLASS_COMPARE = 7,
OPCLASS_CONVERT = 8,
OPCLASS_ATOMIC = 9,
OPCLASS_TEXTURE = 10,
OPCLASS_SURFACE = 11,
OPCLASS_FLOW = 12,
OPCLASS_PSEUDO = 14,
OPCLASS_OTHER = 15
};
class Target
{
public:
Target(bool j, bool s) : joinAnterior(j), hasSWSched(s) { }
static Target *create(uint32_t chipset);
static void destroy(Target *);
@@ -153,6 +175,9 @@ public:
virtual bool mayPredicate(const Instruction *,
const Value *) const = 0;
// whether @insn can be issued together with @next (order matters)
virtual bool canDualIssue(const Instruction *insn,
const Instruction *next) const { return false; }
virtual int getLatency(const Instruction *) const { return 1; }
virtual int getThroughput(const Instruction *) const { return 1; }
@@ -162,9 +187,20 @@ public:
virtual uint32_t getSVAddress(DataFile, const Symbol *) const = 0;
public:
bool joinAnterior; // true if join is executed before the op
const bool joinAnterior; // true if join is executed before the op
const bool hasSWSched; // true if code should provide scheduling data
static const uint8_t operationSrcNr[OP_LAST + 1];
static const OpClass operationClass[OP_LAST + 1];
static inline uint8_t getOpSrcNr(operation op)
{
return operationSrcNr[op];
}
static inline OpClass getOpClass(operation op)
{
return operationClass[op];
}
protected:
uint32_t chipset;

@@ -29,7 +29,7 @@ Target *getTargetNV50(unsigned int chipset)
return new TargetNV50(chipset);
}
TargetNV50::TargetNV50(unsigned int card)
TargetNV50::TargetNV50(unsigned int card) : Target(true, false)
{
chipset = card;
@@ -132,8 +132,6 @@ void TargetNV50::initOpInfo()
OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT
};
joinAnterior = true;
for (i = 0; i < DATA_FILE_COUNT; ++i)
nativeFileMap[i] = (DataFile)i;
nativeFileMap[FILE_PREDICATE] = FILE_FLAGS;

@@ -33,6 +33,7 @@ public:
virtual bool emitInstruction(Instruction *);
virtual uint32_t getMinEncodingSize(const Instruction *) const;
virtual void prepareEmission(Function *);
inline void setProgramType(Program::Type pType) { progType = pType; }
@@ -41,6 +42,8 @@ private:
Program::Type progType;
const bool writeIssueDelays;
private:
void emitForm_A(const Instruction *, uint64_t);
void emitForm_B(const Instruction *, uint64_t);
@@ -1505,15 +1508,40 @@ CodeEmitterNVC0::emitMOV(const Instruction *i)
bool
CodeEmitterNVC0::emitInstruction(Instruction *insn)
{
unsigned int size = insn->encSize;
if (writeIssueDelays && !(codeSize & 0x3f))
size += 8;
if (!insn->encSize) {
ERROR("skipping unencodable instruction: "); insn->print();
return false;
} else
if (codeSize + insn->encSize > codeSizeLimit) {
if (codeSize + size > codeSizeLimit) {
ERROR("code emitter output buffer too small\n");
return false;
}
if (writeIssueDelays) {
if (!(codeSize & 0x3f)) {
code[0] = 0x00000007; // cf issue delay "instruction"
code[1] = 0x20000000;
code += 2;
codeSize += 8;
}
const unsigned int id = (codeSize & 0x3f) / 8 - 1;
uint32_t *data = code - (id * 2 + 2);
if (id <= 2) {
data[0] |= insn->sched << (id * 8 + 4);
} else
if (id == 3) {
data[0] |= insn->sched << 28;
data[1] |= insn->sched >> 4;
} else {
data[1] |= insn->sched << ((id - 4) * 8 + 4);
}
}
// assert that instructions with multiple defs don't corrupt registers
for (int d = 0; insn->defExists(d); ++d)
assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0);
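(Not part of the diff: a sketch of the whole control word implied by the packing logic above, assuming a plain array of the seven sched bytes that follow it; useful for seeing the bit layout at a glance.)

#include <cstdint>

// bits [3:0] = 0x7, seven 8-bit sched fields at bits 4..59, bits [63:60] = 0x2
static void packSchedWord(const uint8_t sched[7], uint32_t data[2])
{
   data[0] = 0x00000007;
   data[1] = 0x20000000;
   for (unsigned id = 0; id < 7; ++id) {
      if (id <= 2) {
         data[0] |= (uint32_t)sched[id] << (id * 8 + 4);
      } else if (id == 3) {            // byte 3 straddles the two 32-bit halves
         data[0] |= (uint32_t)sched[id] << 28;
         data[1] |= (uint32_t)sched[id] >> 4;
      } else {
         data[1] |= (uint32_t)sched[id] << ((id - 4) * 8 + 4);
      }
   }
}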
@@ -1707,7 +1735,7 @@ CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
{
const Target::OpInfo &info = targ->getOpInfo(i);
if (info.minEncSize == 8 || 1)
if (writeIssueDelays || info.minEncSize == 8 || 1)
return 8;
if (i->ftz || i->saturate || i->join)
@@ -1761,7 +1789,503 @@ CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
return 4;
}
CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target) : CodeEmitter(target)
// Simplified, erring on the safe side.
class SchedDataCalculator : public Pass
{
public:
SchedDataCalculator(const Target *targ) : targ(targ) { }
private:
struct RegScores
{
struct Resource {
int st[DATA_FILE_COUNT]; // ST to ST delay 3
int ld[DATA_FILE_COUNT]; // LD to LD delay 3
int tex; // TEX to non-TEX delay 17 (0x11)
int sfu; // SFU to SFU delay 3 (except PRE-ops)
int imul; // integer MUL to MUL delay 3
} res;
struct ScoreData {
int r[64];
int p[8];
int c;
} rd, wr;
int base;
void rebase(const int base)
{
const int delta = this->base - base;
if (!delta)
return;
this->base = 0;
for (int i = 0; i < 64; ++i) {
rd.r[i] += delta;
wr.r[i] += delta;
}
for (int i = 0; i < 8; ++i) {
rd.p[i] += delta;
wr.p[i] += delta;
}
rd.c += delta;
wr.c += delta;
for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
res.ld[f] += delta;
res.st[f] += delta;
}
res.sfu += delta;
res.imul += delta;
res.tex += delta;
}
void wipe()
{
memset(&rd, 0, sizeof(rd));
memset(&wr, 0, sizeof(wr));
memset(&res, 0, sizeof(res));
}
int getLatest(const ScoreData& d) const
{
int max = 0;
for (int i = 0; i < 64; ++i)
if (d.r[i] > max)
max = d.r[i];
for (int i = 0; i < 8; ++i)
if (d.p[i] > max)
max = d.p[i];
if (d.c > max)
max = d.c;
return max;
}
inline int getLatestRd() const
{
return getLatest(rd);
}
inline int getLatestWr() const
{
return getLatest(wr);
}
inline int getLatest() const
{
const int a = getLatestRd();
const int b = getLatestWr();
int max = MAX2(a, b);
for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
max = MAX2(res.ld[f], max);
max = MAX2(res.st[f], max);
}
max = MAX2(res.sfu, max);
max = MAX2(res.imul, max);
max = MAX2(res.tex, max);
return max;
}
void setMax(const RegScores *that)
{
for (int i = 0; i < 64; ++i) {
rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
}
for (int i = 0; i < 8; ++i) {
rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
}
rd.c = MAX2(rd.c, that->rd.c);
wr.c = MAX2(wr.c, that->wr.c);
for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
res.ld[f] = MAX2(res.ld[f], that->res.ld[f]);
res.st[f] = MAX2(res.st[f], that->res.st[f]);
}
res.sfu = MAX2(res.sfu, that->res.sfu);
res.imul = MAX2(res.imul, that->res.imul);
res.tex = MAX2(res.tex, that->res.tex);
}
void print(int cycle)
{
for (int i = 0; i < 64; ++i) {
if (rd.r[i] > cycle)
INFO("rd $r%i @ %i\n", i, rd.r[i]);
if (wr.r[i] > cycle)
INFO("wr $r%i @ %i\n", i, wr.r[i]);
}
for (int i = 0; i < 8; ++i) {
if (rd.p[i] > cycle)
INFO("rd $p%i @ %i\n", i, rd.p[i]);
if (wr.p[i] > cycle)
INFO("wr $p%i @ %i\n", i, wr.p[i]);
}
if (rd.c > cycle)
INFO("rd $c @ %i\n", rd.c);
if (wr.c > cycle)
INFO("wr $c @ %i\n", wr.c);
if (res.sfu > cycle)
INFO("sfu @ %i\n", res.sfu);
if (res.imul > cycle)
INFO("imul @ %i\n", res.imul);
if (res.tex > cycle)
INFO("tex @ %i\n", res.tex);
}
};
RegScores *score; // for current BB
std::vector<RegScores> scoreBoards;
int cycle;
int prevData;
operation prevOp;
const Target *targ;
bool visit(Function *);
bool visit(BasicBlock *);
void commitInsn(const Instruction *, int cycle);
int calcDelay(const Instruction *, int cycle) const;
void setDelay(Instruction *, int delay, Instruction *next);
void recordRd(const Value *, const int ready);
void recordWr(const Value *, const int ready);
void checkRd(const Value *, int cycle, int& delay) const;
void checkWr(const Value *, int cycle, int& delay) const;
int getCycles(const Instruction *, int origDelay) const;
};
void
SchedDataCalculator::setDelay(Instruction *insn, int delay, Instruction *next)
{
if (insn->op == OP_EXIT)
delay = MAX2(delay, 14);
if (insn->op == OP_TEXBAR) {
// TODO: except if results not used before EXIT
insn->sched = 0xc2;
} else
if (insn->op == OP_JOIN || insn->join) {
insn->sched = 0x00;
} else
if (delay >= 0 || prevData == 0x04 ||
!next || !targ->canDualIssue(insn, next)) {
insn->sched = static_cast<uint8_t>(MAX2(delay, 0));
if (prevOp == OP_EXPORT)
insn->sched |= 0x40;
else
insn->sched |= 0x20;
} else {
insn->sched = 0x04; // dual-issue
}
if (prevData != 0x04 || prevOp != OP_EXPORT)
if (insn->sched != 0x04 || insn->op == OP_EXPORT)
prevOp = insn->op;
prevData = insn->sched;
}
int
SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
{
if (insn->sched & 0x80) {
int c = (insn->sched & 0x0f) * 2 + 1;
if (insn->op == OP_TEXBAR && origDelay > 0)
c += origDelay;
return c;
}
if (insn->sched & 0x60)
return (insn->sched & 0x1f) + 1;
return (insn->sched == 0x04) ? 0 : 32;
}
bool
SchedDataCalculator::visit(Function *func)
{
scoreBoards.resize(func->cfg.getSize());
for (size_t i = 0; i < scoreBoards.size(); ++i)
scoreBoards[i].wipe();
return true;
}
bool
SchedDataCalculator::visit(BasicBlock *bb)
{
Instruction *insn;
Instruction *next = NULL;
int cycle = 0;
prevData = 0x00;
prevOp = OP_NOP;
score = &scoreBoards.at(bb->getId());
for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
BasicBlock *in = BasicBlock::get(ei.getNode());
if (in->getExit()) {
if (prevData != 0x04)
prevData = in->getExit()->sched;
prevOp = in->getExit()->op;
}
if (ei.getType() != Graph::Edge::BACK)
score->setMax(&scoreBoards.at(in->getId()));
// back branches will wait until all target dependencies are satisfied
}
if (bb->cfg.incidentCount() > 1)
prevOp = OP_NOP;
#ifdef NVC0_DEBUG_SCHED_DATA
INFO("=== BB:%i initial scores\n", bb->getId());
score->print(cycle);
#endif
for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
next = insn->next;
commitInsn(insn, cycle);
int delay = calcDelay(next, cycle);
setDelay(insn, delay, next);
cycle += getCycles(insn, delay);
#ifdef NVC0_DEBUG_SCHED_DATA
INFO("cycle %i, sched %02x\n", cycle, insn->sched);
insn->print();
next->print();
#endif
}
if (!insn)
return true;
commitInsn(insn, cycle);
int bbDelay = -1;
for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
BasicBlock *out = BasicBlock::get(ei.getNode());
if (ei.getType() != Graph::Edge::BACK) {
// only test the first instruction of the outgoing block
next = out->getEntry();
if (next)
bbDelay = MAX2(bbDelay, calcDelay(next, cycle));
} else {
// wait until all dependencies are satisfied
const int regsFree = score->getLatest();
next = out->getFirst();
for (int c = cycle; next && c < regsFree; next = next->next) {
bbDelay = MAX2(bbDelay, calcDelay(next, c));
c += getCycles(next, bbDelay);
}
next = NULL;
}
}
if (bb->cfg.outgoingCount() != 1)
next = NULL;
setDelay(insn, bbDelay, next);
cycle += getCycles(insn, bbDelay);
score->rebase(cycle); // common base for initializing out blocks' scores
return true;
}
#define NVE4_MAX_ISSUE_DELAY 0x1f
int
SchedDataCalculator::calcDelay(const Instruction *insn, int cycle) const
{
int delay = 0, ready = cycle;
for (int s = 0; insn->srcExists(s); ++s)
checkRd(insn->getSrc(s), cycle, delay);
// WAR & WAW don't seem to matter
// for (int s = 0; insn->srcExists(s); ++s)
// recordRd(insn->getSrc(s), cycle);
switch (Target::getOpClass(insn->op)) {
case OPCLASS_SFU:
ready = score->res.sfu;
break;
case OPCLASS_ARITH:
if (insn->op == OP_MUL && !isFloatType(insn->dType))
ready = score->res.imul;
break;
case OPCLASS_TEXTURE:
ready = score->res.tex;
break;
case OPCLASS_LOAD:
ready = score->res.ld[insn->src(0).getFile()];
break;
case OPCLASS_STORE:
ready = score->res.st[insn->src(0).getFile()];
break;
default:
break;
}
if (Target::getOpClass(insn->op) != OPCLASS_TEXTURE)
ready = MAX2(ready, score->res.tex);
delay = MAX2(delay, ready - cycle);
// if can issue next cycle, delay is 0, not 1
return MIN2(delay - 1, NVE4_MAX_ISSUE_DELAY);
}
void
SchedDataCalculator::commitInsn(const Instruction *insn, int cycle)
{
const int ready = cycle + targ->getLatency(insn);
for (int d = 0; insn->defExists(d); ++d)
recordWr(insn->getDef(d), ready);
// WAR & WAW don't seem to matter
// for (int s = 0; insn->srcExists(s); ++s)
// recordRd(insn->getSrc(s), cycle);
switch (Target::getOpClass(insn->op)) {
case OPCLASS_SFU:
score->res.sfu = cycle + 4;
break;
case OPCLASS_ARITH:
if (insn->op == OP_MUL && !isFloatType(insn->dType))
score->res.imul = cycle + 4;
break;
case OPCLASS_TEXTURE:
score->res.tex = cycle + 18;
break;
case OPCLASS_LOAD:
if (insn->src(0).getFile() == FILE_MEMORY_CONST)
break;
score->res.ld[insn->src(0).getFile()] = cycle + 4;
score->res.st[insn->src(0).getFile()] = ready;
break;
case OPCLASS_STORE:
score->res.st[insn->src(0).getFile()] = cycle + 4;
score->res.ld[insn->src(0).getFile()] = ready;
break;
case OPCLASS_OTHER:
if (insn->op == OP_TEXBAR)
score->res.tex = cycle;
break;
default:
break;
}
#ifdef NVC0_DEBUG_SCHED_DATA
score->print(cycle);
#endif
}
void
SchedDataCalculator::checkRd(const Value *v, int cycle, int& delay) const
{
int ready = cycle;
int a, b;
switch (v->reg.file) {
case FILE_GPR:
a = v->reg.data.id;
b = a + v->reg.size / 4;
for (int r = a; r < b; ++r)
ready = MAX2(ready, score->rd.r[r]);
break;
case FILE_PREDICATE:
ready = MAX2(ready, score->rd.p[v->reg.data.id]);
break;
case FILE_FLAGS:
ready = MAX2(ready, score->rd.c);
break;
case FILE_SHADER_INPUT:
case FILE_SHADER_OUTPUT: // yes, TCPs can read outputs
case FILE_MEMORY_LOCAL:
case FILE_MEMORY_CONST:
case FILE_MEMORY_SHARED:
case FILE_MEMORY_GLOBAL:
case FILE_SYSTEM_VALUE:
// TODO: any restrictions here ?
break;
case FILE_IMMEDIATE:
break;
default:
assert(0);
break;
}
if (cycle < ready)
delay = MAX2(delay, ready - cycle);
}
void
SchedDataCalculator::checkWr(const Value *v, int cycle, int& delay) const
{
int ready = cycle;
int a, b;
switch (v->reg.file) {
case FILE_GPR:
a = v->reg.data.id;
b = a + v->reg.size / 4;
for (int r = a; r < b; ++r)
ready = MAX2(ready, score->wr.r[r]);
break;
case FILE_PREDICATE:
ready = MAX2(ready, score->wr.p[v->reg.data.id]);
break;
default:
assert(v->reg.file == FILE_FLAGS);
ready = MAX2(ready, score->wr.c);
break;
}
if (cycle < ready)
delay = MAX2(delay, ready - cycle);
}
void
SchedDataCalculator::recordWr(const Value *v, const int ready)
{
int a = v->reg.data.id;
if (v->reg.file == FILE_GPR) {
int b = a + v->reg.size / 4;
for (int r = a; r < b; ++r)
score->rd.r[r] = ready;
} else
// $c, $pX: shorter issue-to-read delay (at least as exec pred and carry)
if (v->reg.file == FILE_PREDICATE) {
score->rd.p[a] = ready + 4;
} else {
assert(v->reg.file == FILE_FLAGS);
score->rd.c = ready + 4;
}
}
void
SchedDataCalculator::recordRd(const Value *v, const int ready)
{
int a = v->reg.data.id;
if (v->reg.file == FILE_GPR) {
int b = a + v->reg.size / 4;
for (int r = a; r < b; ++r)
score->wr.r[r] = ready;
} else
if (v->reg.file == FILE_PREDICATE) {
score->wr.p[a] = ready;
} else
if (v->reg.file == FILE_FLAGS) {
score->wr.c = ready;
}
}
void
CodeEmitterNVC0::prepareEmission(Function *func)
{
const Target *targ = func->getProgram()->getTarget();
CodeEmitter::prepareEmission(func);
if (targ->hasSWSched) {
SchedDataCalculator sched(targ);
sched.run(func, true, true);
}
}
CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target)
: CodeEmitter(target),
writeIssueDelays(target->hasSWSched)
{
code = NULL;
codeSize = codeSizeLimit = 0;
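(A worked example, not from the commit, tying calcDelay(), setDelay() and getCycles() together; it assumes the NVE4 latency of 9 cycles for constant-buffer loads given in TargetNVC0::getLatency() further down.)

#include <cassert>
#include <algorithm>

int main()
{
   const int cycle = 0;
   const int ready = cycle + 9;               // commitInsn(): recordWr(def, cycle + latency)
   int delay = std::max(0, ready - cycle);    // calcDelay(): checkRd() on the consumer
   delay = std::min(delay - 1, 0x1f);         // "if can issue next cycle, delay is 0, not 1"
   assert(delay == 8);
   const unsigned sched = 0x20u | delay;      // setDelay(): normal stall encoding
   assert(sched == 0x28);
   assert(((sched & 0x1f) + 1) == 9);         // getCycles() recovers the 9-cycle wait
   return 0;
}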

@@ -29,7 +29,7 @@ Target *getTargetNVC0(unsigned int chipset)
return new TargetNVC0(chipset);
}
TargetNVC0::TargetNVC0(unsigned int card)
TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4)
{
chipset = card;
initOpInfo();
@@ -274,8 +274,6 @@ void TargetNVC0::initOpInfo()
OP_QUADON, OP_QUADPOP, OP_TEXBAR
};
joinAnterior = false;
for (i = 0; i < DATA_FILE_COUNT; ++i)
nativeFileMap[i] = (DataFile)i;
nativeFileMap[FILE_ADDRESS] = FILE_GPR;
@@ -534,14 +532,39 @@ TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const
}
// TODO: better values
// this could be more precise, e.g. depending on the issue-to-read/write delay
// of the dependent instruction, but it's good enough
int TargetNVC0::getLatency(const Instruction *i) const
{
if (i->op == OP_LOAD) {
if (i->cache == CACHE_CV)
return 700;
return 48;
if (chipset >= 0xe4) {
if (i->dType == TYPE_F64 || i->sType == TYPE_F64)
return 20;
switch (i->op) {
case OP_LINTERP:
case OP_PINTERP:
return 15;
case OP_LOAD:
if (i->src(0).getFile() == FILE_MEMORY_CONST)
return 9;
// fall through
case OP_VFETCH:
return 24;
default:
if (Target::getOpClass(i->op) == OPCLASS_TEXTURE)
return 17;
if (i->op == OP_MUL && i->dType != TYPE_F32)
return 15;
return 9;
}
} else {
if (i->op == OP_LOAD) {
if (i->cache == CACHE_CV)
return 700;
return 48;
}
return 24;
}
return 24;
return 32;
}
// These are "inverse" throughput values, i.e. the number of cycles required
@@ -613,4 +636,42 @@ int TargetNVC0::getThroughput(const Instruction *i) const
}
}
bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const
{
const OpClass clA = operationClass[a->op];
const OpClass clB = operationClass[b->op];
if (getChipset() >= 0xe4) {
// not texturing
// not if the 2nd instruction isn't necessarily executed
if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW)
return false;
// anything with MOV
if (a->op == OP_MOV || b->op == OP_MOV)
return true;
if (clA == clB) {
// only F32 arith or integer additions
if (clA != OPCLASS_ARITH)
return false;
return (a->dType == TYPE_F32 || a->op == OP_ADD ||
b->dType == TYPE_F32 || b->op == OP_ADD);
}
// nothing with TEXBAR
if (a->op == OP_TEXBAR || b->op == OP_TEXBAR)
return false;
// no loads and stores accessing the same space
if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) ||
(clB == OPCLASS_LOAD && clA == OPCLASS_STORE))
if (a->src(0).getFile() == b->src(0).getFile())
return false;
// no > 32-bit ops
if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 ||
typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4)
return false;
return true;
} else {
return false; // info not needed (yet)
}
}
} // namespace nv50_ir
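(Standalone sketch, not in the commit: the pairing rules above restated over a minimal instruction description, to make the decision order explicit. All type and field names here are illustrative.)

enum class Cls { Move, Arith, Texture, Flow, Load, Store, Other };

struct MiniInsn {
   Cls  cls;
   bool isMov, isAdd, isTexbar;
   bool f32;          // dType == TYPE_F32
   int  typeBytes;    // largest of dType/sType sizes, in bytes
   int  memFile;      // memory file of src(0) for loads/stores
};

static bool canPairNVE4(const MiniInsn &a, const MiniInsn &b)
{
   if (a.cls == Cls::Texture || a.cls == Cls::Flow)       // no texturing, no flow first
      return false;
   if (a.isMov || b.isMov)                                // anything pairs with a MOV
      return true;
   if (a.cls == b.cls)                                    // same class: only F32 arith or integer adds
      return a.cls == Cls::Arith && (a.f32 || a.isAdd || b.f32 || b.isAdd);
   if (a.isTexbar || b.isTexbar)                          // nothing with TEXBAR
      return false;
   if ((a.cls == Cls::Load && b.cls == Cls::Store) ||     // no load/store pair on the same space
       (b.cls == Cls::Load && a.cls == Cls::Store))
      if (a.memFile == b.memFile)
         return false;
   return a.typeBytes <= 4 && b.typeBytes <= 4;           // no operations wider than 32 bits
}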

@@ -51,6 +51,7 @@ public:
virtual bool isPostMultiplySupported(operation, float, int& e) const;
virtual bool mayPredicate(const Instruction *, const Value *) const;
virtual bool canDualIssue(const Instruction *, const Instruction *) const;
virtual int getLatency(const Instruction *) const;
virtual int getThroughput(const Instruction *) const;