mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 00:00:11 +01:00
nvc0/ir: initial implementation of nve4 scheduling hints
This commit is contained in:
parent
00fe442253
commit
afcd7b5d16
8 changed files with 738 additions and 15 deletions
|
|
@ -681,6 +681,8 @@ public:
|
|||
|
||||
uint8_t subOp; // quadop, 1 for mul-high, etc.
|
||||
|
||||
uint8_t sched; // scheduling data (NOTE: maybe move to separate storage)
|
||||
|
||||
unsigned encSize : 4; // encoding size in bytes
|
||||
unsigned saturate : 1; // to [0.0f, 1.0f]
|
||||
unsigned join : 1; // converge control flow (use OP_JOIN until end)
|
||||
|
|
|
|||
|
|
@ -53,6 +53,26 @@ static const char *colour[8] =
|
|||
#endif
|
||||
};
|
||||
|
||||
// Printable name for each OpClass value, indexed by the enumerator's value.
// No enumerator has the value 13, hence the "(INVALID)" placeholder.
static const char *OpClassStr[OPCLASS_OTHER + 1] =
{
   "MOVE",      //  0
   "LOAD",      //  1
   "STORE",     //  2
   "ARITH",     //  3
   "SHIFT",     //  4
   "SFU",       //  5
   "LOGIC",     //  6
   "COMPARE",   //  7
   "CONVERT",   //  8
   "ATOMIC",    //  9
   "TEXTURE",   // 10
   "SURFACE",   // 11
   "FLOW",      // 12
   "(INVALID)", // 13
   "PSEUDO",    // 14
   "OTHER"      // 15
};
|
||||
|
||||
const char *operationStr[OP_LAST + 1] =
|
||||
{
|
||||
"nop",
|
||||
|
|
|
|||
|
|
@ -52,6 +52,65 @@ const uint8_t Target::operationSrcNr[OP_LAST + 1] =
|
|||
0
|
||||
};
|
||||
|
||||
// Operation class of every opcode; Target::getOpClass() indexes this table
// directly with the operation value, so the rows have to follow the order of
// the opcode enumeration. Each comment names the opcodes of the row below it.
const OpClass Target::operationClass[OP_LAST + 1] =
{
   // NOP
   OPCLASS_OTHER,
   // PHI
   OPCLASS_PSEUDO,
   // UNION, SPLIT, MERGE, CONSTRAINT
   OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO,
   // MOV
   OPCLASS_MOVE,
   // LOAD
   OPCLASS_LOAD,
   // STORE
   OPCLASS_STORE,
   // ADD, SUB, MUL
   OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
   // DIV, MOD
   OPCLASS_ARITH, OPCLASS_ARITH,
   // MAD, FMA, SAD
   OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
   // ABS, NEG
   OPCLASS_CONVERT, OPCLASS_CONVERT,
   // NOT, AND, OR, XOR
   OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC,
   // SHL, SHR
   OPCLASS_SHIFT, OPCLASS_SHIFT,
   // MAX, MIN
   OPCLASS_COMPARE, OPCLASS_COMPARE,
   // SAT, CEIL, FLOOR, TRUNC
   OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT,
   // CVT
   OPCLASS_CONVERT,
   // SET(AND,OR,XOR)
   OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE,
   // SELP, SLCT
   OPCLASS_COMPARE, OPCLASS_COMPARE,
   // RCP, RSQ, LG2, SIN, COS
   OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
   // EX2, EXP, LOG, PRESIN, PREEX2
   OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
   // SQRT, POW
   OPCLASS_SFU, OPCLASS_SFU,
   // BRA, CALL, RET
   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
   // CONT, BREAK, PRE(RET,CONT,BREAK)
   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
   // BRKPT, JOINAT, JOIN
   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
   // DISCARD, EXIT
   OPCLASS_FLOW, OPCLASS_FLOW,
   // MEMBAR
   OPCLASS_OTHER,
   // VFETCH, PFETCH, EXPORT
   OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE,
   // LINTERP, PINTERP
   OPCLASS_SFU, OPCLASS_SFU,
   // EMIT, RESTART
   OPCLASS_OTHER, OPCLASS_OTHER,
   // TEX, TXB, TXL, TXF
   OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
   // TXQ, TXD, TXG, TEXCSAA
   OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
   // SULD, SUST
   OPCLASS_SURFACE, OPCLASS_SURFACE,
   // DFDX, DFDY, RDSV, WRSV
   OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
   // PIXLD, QUADOP, QUADON, QUADPOP
   OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
   // POPCNT, INSBF, EXTBF
   OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
   // TEXBAR
   OPCLASS_OTHER,
   // LAST
   OPCLASS_PSEUDO
};
|
||||
|
||||
|
||||
extern Target *getTargetNVC0(unsigned int chipset);
|
||||
extern Target *getTargetNV50(unsigned int chipset);
|
||||
|
|
@ -104,6 +163,11 @@ CodeEmitter::printBinary() const
|
|||
INFO("\n");
|
||||
}
|
||||
|
||||
// Number of 56-byte groups needed to cover @size bytes, rounded up
// (0 -> 0, 1..56 -> 1, 57..112 -> 2, ...).
//
// CodeEmitter::prepareEmission(Program *) adds 8 bytes per returned group to
// block positions to make room for the scheduling data.
static inline uint32_t sizeToBundlesNVE4(uint32_t size)
{
   // Same result as (size + 55) / 56, but without the addition, which wraps
   // around for sizes within 55 of UINT32_MAX.
   return size / 56 + ((size % 56) ? 1 : 0);
}
|
||||
|
||||
void
|
||||
CodeEmitter::prepareEmission(Program *prog)
|
||||
{
|
||||
|
|
@ -112,6 +176,23 @@ CodeEmitter::prepareEmission(Program *prog)
|
|||
Function *func = reinterpret_cast<Function *>(fi.get());
|
||||
func->binPos = prog->binSize;
|
||||
prepareEmission(func);
|
||||
|
||||
// adjust sizes & positions for scheduling info:
|
||||
if (prog->getTarget()->hasSWSched) {
|
||||
BasicBlock *bb = NULL;
|
||||
for (int i = 0; i < func->bbCount; ++i) {
|
||||
bb = func->bbArray[i];
|
||||
const uint32_t oldPos = bb->binPos;
|
||||
const uint32_t oldEnd = bb->binPos + bb->binSize;
|
||||
uint32_t adjPos = oldPos + sizeToBundlesNVE4(oldPos) * 8;
|
||||
uint32_t adjEnd = oldEnd + sizeToBundlesNVE4(oldEnd) * 8;
|
||||
bb->binPos = adjPos;
|
||||
bb->binSize = adjEnd - adjPos;
|
||||
}
|
||||
if (bb)
|
||||
func->binSize = bb->binPos + bb->binSize;
|
||||
}
|
||||
|
||||
prog->binSize += func->binSize;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,9 +93,31 @@ protected:
|
|||
RelocInfo *relocInfo;
|
||||
};
|
||||
|
||||
|
||||
// Coarse classification of operations, stored per opcode in
// Target::operationClass. The numeric values are fixed; 13 is not assigned.
enum OpClass
{
   OPCLASS_MOVE = 0,
   OPCLASS_LOAD,        //  1
   OPCLASS_STORE,       //  2
   OPCLASS_ARITH,       //  3
   OPCLASS_SHIFT,       //  4
   OPCLASS_SFU,         //  5
   OPCLASS_LOGIC,       //  6
   OPCLASS_COMPARE,     //  7
   OPCLASS_CONVERT,     //  8
   OPCLASS_ATOMIC,      //  9
   OPCLASS_TEXTURE,     // 10
   OPCLASS_SURFACE,     // 11
   OPCLASS_FLOW,        // 12
   OPCLASS_PSEUDO = 14, // skips 13
   OPCLASS_OTHER        // 15
};
|
||||
|
||||
class Target
|
||||
{
|
||||
public:
|
||||
Target(bool j, bool s) : joinAnterior(j), hasSWSched(s) { }
|
||||
|
||||
static Target *create(uint32_t chipset);
|
||||
static void destroy(Target *);
|
||||
|
||||
|
|
@ -153,6 +175,9 @@ public:
|
|||
virtual bool mayPredicate(const Instruction *,
|
||||
const Value *) const = 0;
|
||||
|
||||
// whether @insn can be issued together with @next (order matters)
|
||||
virtual bool canDualIssue(const Instruction *insn,
|
||||
const Instruction *next) const { return false; }
|
||||
virtual int getLatency(const Instruction *) const { return 1; }
|
||||
virtual int getThroughput(const Instruction *) const { return 1; }
|
||||
|
||||
|
|
@ -162,9 +187,20 @@ public:
|
|||
virtual uint32_t getSVAddress(DataFile, const Symbol *) const = 0;
|
||||
|
||||
public:
|
||||
bool joinAnterior; // true if join is executed before the op
|
||||
const bool joinAnterior; // true if join is executed before the op
|
||||
const bool hasSWSched; // true if code should provide scheduling data
|
||||
|
||||
static const uint8_t operationSrcNr[OP_LAST + 1];
|
||||
static const OpClass operationClass[OP_LAST + 1];
|
||||
|
||||
static inline uint8_t getOpSrcNr(operation op)
|
||||
{
|
||||
return operationSrcNr[op];
|
||||
}
|
||||
static inline OpClass getOpClass(operation op)
|
||||
{
|
||||
return operationClass[op];
|
||||
}
|
||||
|
||||
protected:
|
||||
uint32_t chipset;
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ Target *getTargetNV50(unsigned int chipset)
|
|||
return new TargetNV50(chipset);
|
||||
}
|
||||
|
||||
TargetNV50::TargetNV50(unsigned int card)
|
||||
TargetNV50::TargetNV50(unsigned int card) : Target(true, false)
|
||||
{
|
||||
chipset = card;
|
||||
|
||||
|
|
@ -132,8 +132,6 @@ void TargetNV50::initOpInfo()
|
|||
OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT
|
||||
};
|
||||
|
||||
joinAnterior = true;
|
||||
|
||||
for (i = 0; i < DATA_FILE_COUNT; ++i)
|
||||
nativeFileMap[i] = (DataFile)i;
|
||||
nativeFileMap[FILE_PREDICATE] = FILE_FLAGS;
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ public:
|
|||
|
||||
virtual bool emitInstruction(Instruction *);
|
||||
virtual uint32_t getMinEncodingSize(const Instruction *) const;
|
||||
virtual void prepareEmission(Function *);
|
||||
|
||||
inline void setProgramType(Program::Type pType) { progType = pType; }
|
||||
|
||||
|
|
@ -41,6 +42,8 @@ private:
|
|||
|
||||
Program::Type progType;
|
||||
|
||||
const bool writeIssueDelays;
|
||||
|
||||
private:
|
||||
void emitForm_A(const Instruction *, uint64_t);
|
||||
void emitForm_B(const Instruction *, uint64_t);
|
||||
|
|
@ -1505,15 +1508,40 @@ CodeEmitterNVC0::emitMOV(const Instruction *i)
|
|||
bool
|
||||
CodeEmitterNVC0::emitInstruction(Instruction *insn)
|
||||
{
|
||||
unsigned int size = insn->encSize;
|
||||
|
||||
if (writeIssueDelays && !(codeSize & 0x3f))
|
||||
size += 8;
|
||||
|
||||
if (!insn->encSize) {
|
||||
ERROR("skipping unencodable instruction: "); insn->print();
|
||||
return false;
|
||||
} else
|
||||
if (codeSize + insn->encSize > codeSizeLimit) {
|
||||
if (codeSize + size > codeSizeLimit) {
|
||||
ERROR("code emitter output buffer too small\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (writeIssueDelays) {
|
||||
if (!(codeSize & 0x3f)) {
|
||||
code[0] = 0x00000007; // cf issue delay "instruction"
|
||||
code[1] = 0x20000000;
|
||||
code += 2;
|
||||
codeSize += 8;
|
||||
}
|
||||
const unsigned int id = (codeSize & 0x3f) / 8 - 1;
|
||||
uint32_t *data = code - (id * 2 + 2);
|
||||
if (id <= 2) {
|
||||
data[0] |= insn->sched << (id * 8 + 4);
|
||||
} else
|
||||
if (id == 3) {
|
||||
data[0] |= insn->sched << 28;
|
||||
data[1] |= insn->sched >> 4;
|
||||
} else {
|
||||
data[1] |= insn->sched << ((id - 4) * 8 + 4);
|
||||
}
|
||||
}
|
||||
|
||||
// assert that instructions with multiple defs don't corrupt registers
|
||||
for (int d = 0; insn->defExists(d); ++d)
|
||||
assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0);
|
||||
|
|
@ -1707,7 +1735,7 @@ CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
|
|||
{
|
||||
const Target::OpInfo &info = targ->getOpInfo(i);
|
||||
|
||||
if (info.minEncSize == 8 || 1)
|
||||
if (writeIssueDelays || info.minEncSize == 8 || 1)
|
||||
return 8;
|
||||
|
||||
if (i->ftz || i->saturate || i->join)
|
||||
|
|
@ -1761,7 +1789,503 @@ CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
|
|||
return 4;
|
||||
}
|
||||
|
||||
CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target) : CodeEmitter(target)
|
||||
// Simplified, erring on safe side.
|
||||
class SchedDataCalculator : public Pass
|
||||
{
|
||||
public:
|
||||
SchedDataCalculator(const Target *targ) : targ(targ) { }
|
||||
|
||||
private:
|
||||
struct RegScores
|
||||
{
|
||||
struct Resource {
|
||||
int st[DATA_FILE_COUNT]; // LD to LD delay 3
|
||||
int ld[DATA_FILE_COUNT]; // ST to ST delay 3
|
||||
int tex; // TEX to non-TEX delay 17 (0x11)
|
||||
int sfu; // SFU to SFU delay 3 (except PRE-ops)
|
||||
int imul; // integer MUL to MUL delay 3
|
||||
} res;
|
||||
struct ScoreData {
|
||||
int r[64];
|
||||
int p[8];
|
||||
int c;
|
||||
} rd, wr;
|
||||
int base;
|
||||
|
||||
void rebase(const int base)
|
||||
{
|
||||
const int delta = this->base - base;
|
||||
if (!delta)
|
||||
return;
|
||||
this->base = 0;
|
||||
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
rd.r[i] += delta;
|
||||
wr.r[i] += delta;
|
||||
}
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
rd.p[i] += delta;
|
||||
wr.p[i] += delta;
|
||||
}
|
||||
rd.c += delta;
|
||||
wr.c += delta;
|
||||
|
||||
for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
|
||||
res.ld[f] += delta;
|
||||
res.st[f] += delta;
|
||||
}
|
||||
res.sfu += delta;
|
||||
res.imul += delta;
|
||||
res.tex += delta;
|
||||
}
|
||||
void wipe()
|
||||
{
|
||||
memset(&rd, 0, sizeof(rd));
|
||||
memset(&wr, 0, sizeof(wr));
|
||||
memset(&res, 0, sizeof(res));
|
||||
}
|
||||
int getLatest(const ScoreData& d) const
|
||||
{
|
||||
int max = 0;
|
||||
for (int i = 0; i < 64; ++i)
|
||||
if (d.r[i] > max)
|
||||
max = d.r[i];
|
||||
for (int i = 0; i < 8; ++i)
|
||||
if (d.p[i] > max)
|
||||
max = d.p[i];
|
||||
if (d.c > max)
|
||||
max = d.c;
|
||||
return max;
|
||||
}
|
||||
inline int getLatestRd() const
|
||||
{
|
||||
return getLatest(rd);
|
||||
}
|
||||
inline int getLatestWr() const
|
||||
{
|
||||
return getLatest(wr);
|
||||
}
|
||||
inline int getLatest() const
|
||||
{
|
||||
const int a = getLatestRd();
|
||||
const int b = getLatestWr();
|
||||
|
||||
int max = MAX2(a, b);
|
||||
for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
|
||||
max = MAX2(res.ld[f], max);
|
||||
max = MAX2(res.st[f], max);
|
||||
}
|
||||
max = MAX2(res.sfu, max);
|
||||
max = MAX2(res.imul, max);
|
||||
max = MAX2(res.tex, max);
|
||||
return max;
|
||||
}
|
||||
void setMax(const RegScores *that)
|
||||
{
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
|
||||
wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
|
||||
}
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
|
||||
wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
|
||||
}
|
||||
rd.c = MAX2(rd.c, that->rd.c);
|
||||
wr.c = MAX2(wr.c, that->wr.c);
|
||||
|
||||
for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
|
||||
res.ld[f] = MAX2(res.ld[f], that->res.ld[f]);
|
||||
res.st[f] = MAX2(res.st[f], that->res.st[f]);
|
||||
}
|
||||
res.sfu = MAX2(res.sfu, that->res.sfu);
|
||||
res.imul = MAX2(res.imul, that->res.imul);
|
||||
res.tex = MAX2(res.tex, that->res.tex);
|
||||
}
|
||||
void print(int cycle)
|
||||
{
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
if (rd.r[i] > cycle)
|
||||
INFO("rd $r%i @ %i\n", i, rd.r[i]);
|
||||
if (wr.r[i] > cycle)
|
||||
INFO("wr $r%i @ %i\n", i, wr.r[i]);
|
||||
}
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if (rd.p[i] > cycle)
|
||||
INFO("rd $p%i @ %i\n", i, rd.p[i]);
|
||||
if (wr.p[i] > cycle)
|
||||
INFO("wr $p%i @ %i\n", i, wr.p[i]);
|
||||
}
|
||||
if (rd.c > cycle)
|
||||
INFO("rd $c @ %i\n", rd.c);
|
||||
if (wr.c > cycle)
|
||||
INFO("wr $c @ %i\n", wr.c);
|
||||
if (res.sfu > cycle)
|
||||
INFO("sfu @ %i\n", res.sfu);
|
||||
if (res.imul > cycle)
|
||||
INFO("imul @ %i\n", res.imul);
|
||||
if (res.tex > cycle)
|
||||
INFO("tex @ %i\n", res.tex);
|
||||
}
|
||||
};
|
||||
|
||||
RegScores *score; // for current BB
|
||||
std::vector<RegScores> scoreBoards;
|
||||
int cycle;
|
||||
int prevData;
|
||||
operation prevOp;
|
||||
|
||||
const Target *targ;
|
||||
|
||||
bool visit(Function *);
|
||||
bool visit(BasicBlock *);
|
||||
|
||||
void commitInsn(const Instruction *, int cycle);
|
||||
int calcDelay(const Instruction *, int cycle) const;
|
||||
void setDelay(Instruction *, int delay, Instruction *next);
|
||||
|
||||
void recordRd(const Value *, const int ready);
|
||||
void recordWr(const Value *, const int ready);
|
||||
void checkRd(const Value *, int cycle, int& delay) const;
|
||||
void checkWr(const Value *, int cycle, int& delay) const;
|
||||
|
||||
int getCycles(const Instruction *, int origDelay) const;
|
||||
};
|
||||
|
||||
void
|
||||
SchedDataCalculator::setDelay(Instruction *insn, int delay, Instruction *next)
|
||||
{
|
||||
if (insn->op == OP_EXIT)
|
||||
delay = MAX2(delay, 14);
|
||||
|
||||
if (insn->op == OP_TEXBAR) {
|
||||
// TODO: except if results not used before EXIT
|
||||
insn->sched = 0xc2;
|
||||
} else
|
||||
if (insn->op == OP_JOIN || insn->join) {
|
||||
insn->sched = 0x00;
|
||||
} else
|
||||
if (delay >= 0 || prevData == 0x04 ||
|
||||
!next || !targ->canDualIssue(insn, next)) {
|
||||
insn->sched = static_cast<uint8_t>(MAX2(delay, 0));
|
||||
if (prevOp == OP_EXPORT)
|
||||
insn->sched |= 0x40;
|
||||
else
|
||||
insn->sched |= 0x20;
|
||||
} else {
|
||||
insn->sched = 0x04; // dual-issue
|
||||
}
|
||||
|
||||
if (prevData != 0x04 || prevOp != OP_EXPORT)
|
||||
if (insn->sched != 0x04 || insn->op == OP_EXPORT)
|
||||
prevOp = insn->op;
|
||||
|
||||
prevData = insn->sched;
|
||||
}
|
||||
|
||||
int
|
||||
SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
|
||||
{
|
||||
if (insn->sched & 0x80) {
|
||||
int c = (insn->sched & 0x0f) * 2 + 1;
|
||||
if (insn->op == OP_TEXBAR && origDelay > 0)
|
||||
c += origDelay;
|
||||
return c;
|
||||
}
|
||||
if (insn->sched & 0x60)
|
||||
return (insn->sched & 0x1f) + 1;
|
||||
return (insn->sched == 0x04) ? 0 : 32;
|
||||
}
|
||||
|
||||
bool
|
||||
SchedDataCalculator::visit(Function *func)
|
||||
{
|
||||
scoreBoards.resize(func->cfg.getSize());
|
||||
for (size_t i = 0; i < scoreBoards.size(); ++i)
|
||||
scoreBoards[i].wipe();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
SchedDataCalculator::visit(BasicBlock *bb)
|
||||
{
|
||||
Instruction *insn;
|
||||
Instruction *next = NULL;
|
||||
|
||||
int cycle = 0;
|
||||
|
||||
prevData = 0x00;
|
||||
prevOp = OP_NOP;
|
||||
score = &scoreBoards.at(bb->getId());
|
||||
|
||||
for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
|
||||
BasicBlock *in = BasicBlock::get(ei.getNode());
|
||||
if (in->getExit()) {
|
||||
if (prevData != 0x04)
|
||||
prevData = in->getExit()->sched;
|
||||
prevOp = in->getExit()->op;
|
||||
}
|
||||
if (ei.getType() != Graph::Edge::BACK)
|
||||
score->setMax(&scoreBoards.at(in->getId()));
|
||||
// back branches will wait until all target dependencies are satisfied
|
||||
}
|
||||
if (bb->cfg.incidentCount() > 1)
|
||||
prevOp = OP_NOP;
|
||||
|
||||
#ifdef NVC0_DEBUG_SCHED_DATA
|
||||
INFO("=== BB:%i initial scores\n", bb->getId());
|
||||
score->print(cycle);
|
||||
#endif
|
||||
|
||||
for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
|
||||
next = insn->next;
|
||||
|
||||
commitInsn(insn, cycle);
|
||||
int delay = calcDelay(next, cycle);
|
||||
setDelay(insn, delay, next);
|
||||
cycle += getCycles(insn, delay);
|
||||
|
||||
#ifdef NVC0_DEBUG_SCHED_DATA
|
||||
INFO("cycle %i, sched %02x\n", cycle, insn->sched);
|
||||
insn->print();
|
||||
next->print();
|
||||
#endif
|
||||
}
|
||||
if (!insn)
|
||||
return true;
|
||||
commitInsn(insn, cycle);
|
||||
|
||||
int bbDelay = -1;
|
||||
|
||||
for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
|
||||
BasicBlock *out = BasicBlock::get(ei.getNode());
|
||||
|
||||
if (ei.getType() != Graph::Edge::BACK) {
|
||||
// only test the first instruction of the outgoing block
|
||||
next = out->getEntry();
|
||||
if (next)
|
||||
bbDelay = MAX2(bbDelay, calcDelay(next, cycle));
|
||||
} else {
|
||||
// wait until all dependencies are satisfied
|
||||
const int regsFree = score->getLatest();
|
||||
next = out->getFirst();
|
||||
for (int c = cycle; next && c < regsFree; next = next->next) {
|
||||
bbDelay = MAX2(bbDelay, calcDelay(next, c));
|
||||
c += getCycles(next, bbDelay);
|
||||
}
|
||||
next = NULL;
|
||||
}
|
||||
}
|
||||
if (bb->cfg.outgoingCount() != 1)
|
||||
next = NULL;
|
||||
setDelay(insn, bbDelay, next);
|
||||
cycle += getCycles(insn, bbDelay);
|
||||
|
||||
score->rebase(cycle); // common base for initializing out blocks' scores
|
||||
return true;
|
||||
}
|
||||
|
||||
#define NVE4_MAX_ISSUE_DELAY 0x1f
|
||||
int
|
||||
SchedDataCalculator::calcDelay(const Instruction *insn, int cycle) const
|
||||
{
|
||||
int delay = 0, ready = cycle;
|
||||
|
||||
for (int s = 0; insn->srcExists(s); ++s)
|
||||
checkRd(insn->getSrc(s), cycle, delay);
|
||||
// WAR & WAW don't seem to matter
|
||||
// for (int s = 0; insn->srcExists(s); ++s)
|
||||
// recordRd(insn->getSrc(s), cycle);
|
||||
|
||||
switch (Target::getOpClass(insn->op)) {
|
||||
case OPCLASS_SFU:
|
||||
ready = score->res.sfu;
|
||||
break;
|
||||
case OPCLASS_ARITH:
|
||||
if (insn->op == OP_MUL && !isFloatType(insn->dType))
|
||||
ready = score->res.imul;
|
||||
break;
|
||||
case OPCLASS_TEXTURE:
|
||||
ready = score->res.tex;
|
||||
break;
|
||||
case OPCLASS_LOAD:
|
||||
ready = score->res.ld[insn->src(0).getFile()];
|
||||
break;
|
||||
case OPCLASS_STORE:
|
||||
ready = score->res.st[insn->src(0).getFile()];
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (Target::getOpClass(insn->op) != OPCLASS_TEXTURE)
|
||||
ready = MAX2(ready, score->res.tex);
|
||||
|
||||
delay = MAX2(delay, ready - cycle);
|
||||
|
||||
// if can issue next cycle, delay is 0, not 1
|
||||
return MIN2(delay - 1, NVE4_MAX_ISSUE_DELAY);
|
||||
}
|
||||
|
||||
void
|
||||
SchedDataCalculator::commitInsn(const Instruction *insn, int cycle)
|
||||
{
|
||||
const int ready = cycle + targ->getLatency(insn);
|
||||
|
||||
for (int d = 0; insn->defExists(d); ++d)
|
||||
recordWr(insn->getDef(d), ready);
|
||||
// WAR & WAW don't seem to matter
|
||||
// for (int s = 0; insn->srcExists(s); ++s)
|
||||
// recordRd(insn->getSrc(s), cycle);
|
||||
|
||||
switch (Target::getOpClass(insn->op)) {
|
||||
case OPCLASS_SFU:
|
||||
score->res.sfu = cycle + 4;
|
||||
break;
|
||||
case OPCLASS_ARITH:
|
||||
if (insn->op == OP_MUL && !isFloatType(insn->dType))
|
||||
score->res.imul = cycle + 4;
|
||||
break;
|
||||
case OPCLASS_TEXTURE:
|
||||
score->res.tex = cycle + 18;
|
||||
break;
|
||||
case OPCLASS_LOAD:
|
||||
if (insn->src(0).getFile() == FILE_MEMORY_CONST)
|
||||
break;
|
||||
score->res.ld[insn->src(0).getFile()] = cycle + 4;
|
||||
score->res.st[insn->src(0).getFile()] = ready;
|
||||
break;
|
||||
case OPCLASS_STORE:
|
||||
score->res.st[insn->src(0).getFile()] = cycle + 4;
|
||||
score->res.ld[insn->src(0).getFile()] = ready;
|
||||
break;
|
||||
case OPCLASS_OTHER:
|
||||
if (insn->op == OP_TEXBAR)
|
||||
score->res.tex = cycle;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef NVC0_DEBUG_SCHED_DATA
|
||||
score->print(cycle);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
SchedDataCalculator::checkRd(const Value *v, int cycle, int& delay) const
|
||||
{
|
||||
int ready = cycle;
|
||||
int a, b;
|
||||
|
||||
switch (v->reg.file) {
|
||||
case FILE_GPR:
|
||||
a = v->reg.data.id;
|
||||
b = a + v->reg.size / 4;
|
||||
for (int r = a; r < b; ++r)
|
||||
ready = MAX2(ready, score->rd.r[r]);
|
||||
break;
|
||||
case FILE_PREDICATE:
|
||||
ready = MAX2(ready, score->rd.p[v->reg.data.id]);
|
||||
break;
|
||||
case FILE_FLAGS:
|
||||
ready = MAX2(ready, score->rd.c);
|
||||
break;
|
||||
case FILE_SHADER_INPUT:
|
||||
case FILE_SHADER_OUTPUT: // yes, TCPs can read outputs
|
||||
case FILE_MEMORY_LOCAL:
|
||||
case FILE_MEMORY_CONST:
|
||||
case FILE_MEMORY_SHARED:
|
||||
case FILE_MEMORY_GLOBAL:
|
||||
case FILE_SYSTEM_VALUE:
|
||||
// TODO: any restrictions here ?
|
||||
break;
|
||||
case FILE_IMMEDIATE:
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
if (cycle < ready)
|
||||
delay = MAX2(delay, ready - cycle);
|
||||
}
|
||||
|
||||
void
|
||||
SchedDataCalculator::checkWr(const Value *v, int cycle, int& delay) const
|
||||
{
|
||||
int ready = cycle;
|
||||
int a, b;
|
||||
|
||||
switch (v->reg.file) {
|
||||
case FILE_GPR:
|
||||
a = v->reg.data.id;
|
||||
b = a + v->reg.size / 4;
|
||||
for (int r = a; r < b; ++r)
|
||||
ready = MAX2(ready, score->wr.r[r]);
|
||||
break;
|
||||
case FILE_PREDICATE:
|
||||
ready = MAX2(ready, score->wr.p[v->reg.data.id]);
|
||||
break;
|
||||
default:
|
||||
assert(v->reg.file == FILE_FLAGS);
|
||||
ready = MAX2(ready, score->wr.c);
|
||||
break;
|
||||
}
|
||||
if (cycle < ready)
|
||||
delay = MAX2(delay, ready - cycle);
|
||||
}
|
||||
|
||||
void
|
||||
SchedDataCalculator::recordWr(const Value *v, const int ready)
|
||||
{
|
||||
int a = v->reg.data.id;
|
||||
|
||||
if (v->reg.file == FILE_GPR) {
|
||||
int b = a + v->reg.size / 4;
|
||||
for (int r = a; r < b; ++r)
|
||||
score->rd.r[r] = ready;
|
||||
} else
|
||||
// $c, $pX: shorter issue-to-read delay (at least as exec pred and carry)
|
||||
if (v->reg.file == FILE_PREDICATE) {
|
||||
score->rd.p[a] = ready + 4;
|
||||
} else {
|
||||
assert(v->reg.file == FILE_FLAGS);
|
||||
score->rd.c = ready + 4;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
SchedDataCalculator::recordRd(const Value *v, const int ready)
|
||||
{
|
||||
int a = v->reg.data.id;
|
||||
|
||||
if (v->reg.file == FILE_GPR) {
|
||||
int b = a + v->reg.size / 4;
|
||||
for (int r = a; r < b; ++r)
|
||||
score->wr.r[r] = ready;
|
||||
} else
|
||||
if (v->reg.file == FILE_PREDICATE) {
|
||||
score->wr.p[a] = ready;
|
||||
} else
|
||||
if (v->reg.file == FILE_FLAGS) {
|
||||
score->wr.c = ready;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
CodeEmitterNVC0::prepareEmission(Function *func)
|
||||
{
|
||||
const Target *targ = func->getProgram()->getTarget();
|
||||
|
||||
CodeEmitter::prepareEmission(func);
|
||||
|
||||
if (targ->hasSWSched) {
|
||||
SchedDataCalculator sched(targ);
|
||||
sched.run(func, true, true);
|
||||
}
|
||||
}
|
||||
|
||||
CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target)
|
||||
: CodeEmitter(target),
|
||||
writeIssueDelays(target->hasSWSched)
|
||||
{
|
||||
code = NULL;
|
||||
codeSize = codeSizeLimit = 0;
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ Target *getTargetNVC0(unsigned int chipset)
|
|||
return new TargetNVC0(chipset);
|
||||
}
|
||||
|
||||
TargetNVC0::TargetNVC0(unsigned int card)
|
||||
TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4)
|
||||
{
|
||||
chipset = card;
|
||||
initOpInfo();
|
||||
|
|
@ -274,8 +274,6 @@ void TargetNVC0::initOpInfo()
|
|||
OP_QUADON, OP_QUADPOP, OP_TEXBAR
|
||||
};
|
||||
|
||||
joinAnterior = false;
|
||||
|
||||
for (i = 0; i < DATA_FILE_COUNT; ++i)
|
||||
nativeFileMap[i] = (DataFile)i;
|
||||
nativeFileMap[FILE_ADDRESS] = FILE_GPR;
|
||||
|
|
@ -534,14 +532,39 @@ TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const
|
|||
}
|
||||
|
||||
// TODO: better values
|
||||
// this could be more precise, e.g. depending on the issue-to-read/write delay
|
||||
// of the depending instruction, but it's good enough
|
||||
int TargetNVC0::getLatency(const Instruction *i) const
|
||||
{
|
||||
if (i->op == OP_LOAD) {
|
||||
if (i->cache == CACHE_CV)
|
||||
return 700;
|
||||
return 48;
|
||||
if (chipset >= 0xe4) {
|
||||
if (i->dType == TYPE_F64 || i->sType == TYPE_F64)
|
||||
return 20;
|
||||
switch (i->op) {
|
||||
case OP_LINTERP:
|
||||
case OP_PINTERP:
|
||||
return 15;
|
||||
case OP_LOAD:
|
||||
if (i->src(0).getFile() == FILE_MEMORY_CONST)
|
||||
return 9;
|
||||
// fall through
|
||||
case OP_VFETCH:
|
||||
return 24;
|
||||
default:
|
||||
if (Target::getOpClass(i->op) == OPCLASS_TEXTURE)
|
||||
return 17;
|
||||
if (i->op == OP_MUL && i->dType != TYPE_F32)
|
||||
return 15;
|
||||
return 9;
|
||||
}
|
||||
} else {
|
||||
if (i->op == OP_LOAD) {
|
||||
if (i->cache == CACHE_CV)
|
||||
return 700;
|
||||
return 48;
|
||||
}
|
||||
return 24;
|
||||
}
|
||||
return 24;
|
||||
return 32;
|
||||
}
|
||||
|
||||
// These are "inverse" throughput values, i.e. the number of cycles required
|
||||
|
|
@ -613,4 +636,42 @@ int TargetNVC0::getThroughput(const Instruction *i) const
|
|||
}
|
||||
}
|
||||
|
||||
bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const
|
||||
{
|
||||
const OpClass clA = operationClass[a->op];
|
||||
const OpClass clB = operationClass[b->op];
|
||||
|
||||
if (getChipset() >= 0xe4) {
|
||||
// not texturing
|
||||
// not if the 2nd instruction isn't necessarily executed
|
||||
if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW)
|
||||
return false;
|
||||
// anything with MOV
|
||||
if (a->op == OP_MOV || b->op == OP_MOV)
|
||||
return true;
|
||||
if (clA == clB) {
|
||||
// only F32 arith or integer additions
|
||||
if (clA != OPCLASS_ARITH)
|
||||
return false;
|
||||
return (a->dType == TYPE_F32 || a->op == OP_ADD ||
|
||||
b->dType == TYPE_F32 || b->op == OP_ADD);
|
||||
}
|
||||
// nothing with TEXBAR
|
||||
if (a->op == OP_TEXBAR || b->op == OP_TEXBAR)
|
||||
return false;
|
||||
// no loads and stores accessing the the same space
|
||||
if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) ||
|
||||
(clB == OPCLASS_LOAD && clA == OPCLASS_STORE))
|
||||
if (a->src(0).getFile() == b->src(0).getFile())
|
||||
return false;
|
||||
// no > 32-bit ops
|
||||
if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 ||
|
||||
typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4)
|
||||
return false;
|
||||
return true;
|
||||
} else {
|
||||
return false; // info not needed (yet)
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace nv50_ir
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ public:
|
|||
virtual bool isPostMultiplySupported(operation, float, int& e) const;
|
||||
virtual bool mayPredicate(const Instruction *, const Value *) const;
|
||||
|
||||
virtual bool canDualIssue(const Instruction *, const Instruction *) const;
|
||||
virtual int getLatency(const Instruction *) const;
|
||||
virtual int getThroughput(const Instruction *) const;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue