freedreno/ir3: intra-block scheduling

Because of loops, we can't schedule all of a block's predecessors first.
Instead just assume that the result consumed in a block was written far
enough away in all paths into a block.  And do an intra-block scheduling
pass to figure out if there are any cases where we need to insert extra
nop's.  This works out better than always assuming the worst case (ie.
that a value live into a block was written in the last instruction in
the predecessor block).

Signed-off-by: Rob Clark <robdclark@gmail.com>
This commit is contained in:
Rob Clark 2018-02-05 08:45:29 -05:00
parent 2a2099a875
commit c57ed8e01c

View file

@ -136,29 +136,73 @@ deepest(struct ir3_instruction **srcs, unsigned nsrcs)
return d;
}
/**
* @block: the block to search in, starting from end; in first pass,
* this will be the block the instruction would be inserted into
* (but has not yet, ie. it only contains already scheduled
* instructions). For intra-block scheduling (second pass), this
* would be one of the predecessor blocks.
* @instr: the instruction to search for
* @maxd: max distance, bail after searching this # of instruction
* slots, since it means the instruction we are looking for is
* far enough away
* @pred: if true, recursively search into predecessor blocks to
* find the worst case (shortest) distance (only possible after
* individual blocks are all scheduled
*/
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
unsigned maxd)
distance(struct ir3_block *block, struct ir3_instruction *instr,
unsigned maxd, bool pred)
{
struct list_head *instr_list = &ctx->block->instr_list;
unsigned d = 0;
list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
if ((n == instr) || (d >= maxd))
break;
if (is_alu(n) || is_flow(n))
return d;
/* NOTE: don't count branch/jump since we don't know yet if they will
* be eliminated later in resolve_jumps().. really should do that
* earlier so we don't have this constraint.
*/
if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
d++;
}
/* if coming from a predecessor block, assume it is assigned far
* enough away.. we'll fix up later.
*/
if (!pred)
return maxd;
if (pred && (block->data != block)) {
/* Search into predecessor blocks, finding the one with the
* shortest distance, since that will be the worst case
*/
unsigned min = maxd - d;
/* (ab)use block->data to prevent recursion: */
block->data = block;
for (unsigned i = 0; i < block->predecessors_count; i++) {
unsigned n;
n = distance(block->predecessors[i], instr, min, pred);
min = MIN2(min, n);
}
block->data = NULL;
d += min;
}
return d;
}
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
delay_calc_srcn(struct ir3_block *block,
struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
unsigned srcn, bool soft)
unsigned srcn, bool soft, bool pred)
{
unsigned delay = 0;
@ -166,9 +210,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
struct ir3_instruction *src;
foreach_ssa_src(src, assigner) {
unsigned d;
if (src->block != assigner->block)
break;
d = delay_calc_srcn(ctx, src, consumer, srcn, soft);
d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
delay = MAX2(delay, d);
}
} else {
@ -181,7 +223,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
} else {
delay = ir3_delayslots(assigner, consumer, srcn);
}
delay -= distance(ctx, assigner, delay);
delay -= distance(block, assigner, delay, pred);
}
return delay;
@ -189,19 +231,15 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, bool soft)
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
bool soft, bool pred)
{
unsigned delay = 0;
struct ir3_instruction *src;
foreach_ssa_src_n(src, i, instr) {
unsigned d;
/* for array writes, no need to delay on previous write: */
if (i == 0)
continue;
if (src->block != instr->block)
continue;
d = delay_calc_srcn(ctx, src, instr, i, soft);
d = delay_calc_srcn(block, src, instr, i, soft, pred);
delay = MAX2(delay, d);
}
@ -396,7 +434,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (!candidate)
continue;
delay = delay_calc(ctx, candidate, soft);
delay = delay_calc(ctx->block, candidate, soft, false);
if (delay < min_delay) {
best_instr = candidate;
min_delay = delay;
@ -537,7 +575,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
instr = find_eligible_instr(ctx, &notes, false);
if (instr) {
unsigned delay = delay_calc(ctx, instr, false);
unsigned delay = delay_calc(ctx->block, instr, false, false);
/* and if we run out of instructions that can be scheduled,
* then it is time for nop's:
@ -594,7 +632,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
debug_assert(ctx->pred);
debug_assert(block->condition);
delay -= distance(ctx, ctx->pred, delay);
delay -= distance(ctx->block, ctx->pred, delay, false);
while (delay > 0) {
ir3_NOP(block);
@ -633,14 +671,58 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
*/
}
/* After scheduling individual blocks, we still could have cases where
* one (or more) paths into a block, a value produced by a previous
* has too few delay slots to be legal. We can't deal with this in the
* first pass, because loops (ie. we can't ensure all predecessor blocks
* are already scheduled in the first pass). All we can really do at
* this point is stuff in extra nop's until things are legal.
*/
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
unsigned n = 0;
ctx->block = block;
list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
unsigned delay = 0;
for (unsigned i = 0; i < block->predecessors_count; i++) {
unsigned d = delay_calc(block->predecessors[i], instr, false, true);
delay = MAX2(d, delay);
}
while (delay > n) {
struct ir3_instruction *nop = ir3_NOP(block);
/* move to before instr: */
list_delinit(&nop->node);
list_addtail(&nop->node, &instr->node);
n++;
}
/* we can bail once we hit worst case delay: */
if (++n > 6)
break;
}
}
int ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx ctx = {0};
ir3_clear_mark(ir);
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
sched_block(&ctx, block);
}
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
sched_intra_block(&ctx, block);
}
if (ctx.error)
return -1;
return 0;