diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c index c096f50428b..07e04b83892 100644 --- a/src/freedreno/ir3/ir3_postsched.c +++ b/src/freedreno/ir3/ir3_postsched.c @@ -51,11 +51,6 @@ struct ir3_postsched_ctx { struct dag *dag; struct list_head unscheduled_list; /* unscheduled instructions */ - - unsigned ip; - - int ss_delay; - int sy_delay; }; struct ir3_postsched_node { @@ -63,13 +58,23 @@ struct ir3_postsched_node { struct ir3_instruction *instr; bool partially_evaluated_path; - unsigned earliest_ip; + /* The number of nops that need to be inserted if this instruction were + * scheduled now. This is recalculated for all DAG heads whenever a new + * instruction needs to be selected based on the current legalize state. + */ + unsigned delay; bool has_sy_src, has_ss_src; unsigned max_delay; }; +struct ir3_postsched_block_data { + struct ir3_legalize_state legalize_state; + unsigned sy_delay; + unsigned ss_delay; +}; + #define foreach_sched_node(__n, __list) \ list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link) @@ -77,14 +82,14 @@ static bool has_sy_src(struct ir3_instruction *instr) { struct ir3_postsched_node *node = instr->data; - return node->has_sy_src; + return !!(node->instr->flags & IR3_INSTR_SY); } static bool has_ss_src(struct ir3_instruction *instr) { struct ir3_postsched_node *node = instr->data; - return node->has_ss_src; + return !!(node->instr->flags & IR3_INSTR_SS); } #ifndef NDEBUG @@ -108,66 +113,55 @@ schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) di(instr, "schedule"); - bool counts_for_delay = is_alu(instr) || is_flow(instr); - - unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0; - struct ir3_postsched_node *n = instr->data; - /* We insert any nop's needed to get to earliest_ip, then advance - * delay_cycles by scheduling the instruction. - */ - ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles; - - util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) { - unsigned delay = (unsigned)(uintptr_t)edge->data; - struct ir3_postsched_node *child = - container_of(edge->child, struct ir3_postsched_node, dag); - child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay); - } - list_addtail(&instr->node, &instr->block->instr_list); dag_prune_head(ctx->dag, &n->dag); + struct ir3_postsched_block_data *bd = ctx->block->data; + bd->legalize_state.cycle += n->delay; + ir3_update_legalize_state(&bd->legalize_state, ctx->v->compiler, instr); + if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH)) return; if (is_ss_producer(instr)) { - ctx->ss_delay = soft_ss_delay(instr); + bd->ss_delay = soft_ss_delay(instr); } else if (has_ss_src(instr)) { - ctx->ss_delay = 0; - } else if (ctx->ss_delay > 0) { - ctx->ss_delay--; + bd->ss_delay = 0; + } else if (bd->ss_delay > 0) { + bd->ss_delay--; } if (is_sy_producer(instr)) { - ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader); + bd->sy_delay = soft_sy_delay(instr, ctx->block->shader); } else if (has_sy_src(instr)) { - ctx->sy_delay = 0; - } else if (ctx->sy_delay > 0) { - ctx->sy_delay--; + bd->sy_delay = 0; + } else if (bd->sy_delay > 0) { + bd->sy_delay--; } } static unsigned node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n) { - return MAX2(n->earliest_ip, ctx->ip) - ctx->ip; + return n->delay; } static unsigned node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n) { unsigned delay = node_delay(ctx, n); + struct ir3_postsched_block_data *bd = n->instr->block->data; /* This takes into account that as when we schedule multiple tex or sfu, the * first user has to wait for all of them to complete. */ - if (n->has_ss_src) - delay = MAX2(delay, ctx->ss_delay); - if (n->has_sy_src) - delay = MAX2(delay, ctx->sy_delay); + if (has_ss_src(n->instr)) + delay = MAX2(delay, bd->ss_delay); + if (has_sy_src(n->instr)) + delay = MAX2(delay, bd->sy_delay); return delay; } @@ -208,6 +202,20 @@ choose_instr(struct ir3_postsched_ctx *ctx) { struct ir3_postsched_node *chosen = NULL; + struct ir3_postsched_block_data *bd = ctx->block->data; + + /* Needed sync flags and nop delays potentially change after scheduling an + * instruction. Update them for all schedulable instructions. + */ + foreach_sched_node (n, &ctx->dag->heads) { + enum ir3_instruction_flags sync_flags = ir3_required_sync_flags( + &bd->legalize_state, ctx->v->compiler, n->instr); + n->instr->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY); + n->instr->flags |= sync_flags; + n->delay = + ir3_required_delay(&bd->legalize_state, ctx->v->compiler, n->instr); + } + dump_state(ctx); foreach_sched_node (n, &ctx->dag->heads) { @@ -576,8 +584,6 @@ sched_dag_max_delay_cb(struct dag_node *node, void *state) static void sched_dag_init(struct ir3_postsched_ctx *ctx) { - ctx->mem_ctx = ralloc_context(NULL); - ctx->dag = dag_create(ctx->mem_ctx); foreach_instr (instr, &ctx->unscheduled_list) @@ -656,17 +662,44 @@ sched_dag_init(struct ir3_postsched_ctx *ctx) static void sched_dag_destroy(struct ir3_postsched_ctx *ctx) { - ralloc_free(ctx->mem_ctx); - ctx->mem_ctx = NULL; ctx->dag = NULL; } +static struct ir3_legalize_state * +get_block_legalize_state(struct ir3_block *block) +{ + struct ir3_postsched_block_data *bd = block->data; + return bd ? &bd->legalize_state : NULL; +} + static void sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block) { ctx->block = block; - ctx->sy_delay = 0; - ctx->ss_delay = 0; + struct ir3_postsched_block_data *bd = + rzalloc(ctx->mem_ctx, struct ir3_postsched_block_data); + block->data = bd; + + ir3_init_legalize_state(&bd->legalize_state, ctx->v->compiler); + ir3_merge_pred_legalize_states(&bd->legalize_state, block, + get_block_legalize_state); + + /* Initialize the ss/sy_delay by taking the maximum from the predecessors. + * TODO: disable carrying over tex prefetch delays from the preamble for now + * as this seems to negatively affect nop count and stalls. This should be + * revisited in the future. + */ + if (block != ir3_after_preamble(ctx->ir)) { + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ir3_postsched_block_data *pred_bd = pred->data; + + if (pred_bd) { + bd->sy_delay = MAX2(bd->sy_delay, pred_bd->sy_delay); + bd->ss_delay = MAX2(bd->ss_delay, pred_bd->ss_delay); + } + } + } /* The terminator has to stay at the end. Instead of trying to set up * dependencies to achieve this, it's easier to just remove it now and add it @@ -786,13 +819,19 @@ ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v) struct ir3_postsched_ctx ctx = { .ir = ir, .v = v, + .mem_ctx = ralloc_context(NULL), }; cleanup_self_movs(ir); + foreach_block (block, &ir->block_list) { + block->data = NULL; + } + foreach_block (block, &ir->block_list) { sched_block(&ctx, block); } + ralloc_free(ctx.mem_ctx); return true; }