mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-01 06:50:22 +01:00
ir3/postsched: use legalize state for delay/sync calc
Switch to using the newly available ir3_legalize_state API in ir3_postsched. This has a few advantages:
- Prevents duplication of delay/sync logic. ir3_postsched is currently missing a lot of the complexities implemented in ir3_legalize. Reusing the logic makes sure ir3_postsched is kept up to date with these complexities.
- Allows ir3_postsched to have a global view (i.e., across blocks) on delay and sync state. Currently, all information is cleared at the start of blocks, which makes us underestimate required delays.
- Allows ir3_postsched to have a more accurate view on required sync flags. We currently calculate the requirement once, based on whether an instruction's sources come from a ss/sy-producer. This does not take into account whether sources have already been synced. Now we can take this into account.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34108>
This commit is contained in:
parent
ca014c7c24
commit
53de95cb0d
1 changed file with 82 additions and 43 deletions
|
|
@ -51,11 +51,6 @@ struct ir3_postsched_ctx {
|
|||
struct dag *dag;
|
||||
|
||||
struct list_head unscheduled_list; /* unscheduled instructions */
|
||||
|
||||
unsigned ip;
|
||||
|
||||
int ss_delay;
|
||||
int sy_delay;
|
||||
};
|
||||
|
||||
struct ir3_postsched_node {
|
||||
|
|
@ -63,13 +58,23 @@ struct ir3_postsched_node {
|
|||
struct ir3_instruction *instr;
|
||||
bool partially_evaluated_path;
|
||||
|
||||
unsigned earliest_ip;
|
||||
/* The number of nops that need to be inserted if this instruction were
|
||||
* scheduled now. This is recalculated for all DAG heads whenever a new
|
||||
* instruction needs to be selected based on the current legalize state.
|
||||
*/
|
||||
unsigned delay;
|
||||
|
||||
bool has_sy_src, has_ss_src;
|
||||
|
||||
unsigned max_delay;
|
||||
};
|
||||
|
||||
struct ir3_postsched_block_data {
|
||||
struct ir3_legalize_state legalize_state;
|
||||
unsigned sy_delay;
|
||||
unsigned ss_delay;
|
||||
};
|
||||
|
||||
#define foreach_sched_node(__n, __list) \
|
||||
list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)
|
||||
|
||||
|
|
@ -77,14 +82,14 @@ static bool
|
|||
has_sy_src(struct ir3_instruction *instr)
|
||||
{
|
||||
struct ir3_postsched_node *node = instr->data;
|
||||
return node->has_sy_src;
|
||||
return !!(node->instr->flags & IR3_INSTR_SY);
|
||||
}
|
||||
|
||||
static bool
|
||||
has_ss_src(struct ir3_instruction *instr)
|
||||
{
|
||||
struct ir3_postsched_node *node = instr->data;
|
||||
return node->has_ss_src;
|
||||
return !!(node->instr->flags & IR3_INSTR_SS);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
|
@ -108,66 +113,55 @@ schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
|
|||
|
||||
di(instr, "schedule");
|
||||
|
||||
bool counts_for_delay = is_alu(instr) || is_flow(instr);
|
||||
|
||||
unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;
|
||||
|
||||
struct ir3_postsched_node *n = instr->data;
|
||||
|
||||
/* We insert any nop's needed to get to earliest_ip, then advance
|
||||
* delay_cycles by scheduling the instruction.
|
||||
*/
|
||||
ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;
|
||||
|
||||
util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
|
||||
unsigned delay = (unsigned)(uintptr_t)edge->data;
|
||||
struct ir3_postsched_node *child =
|
||||
container_of(edge->child, struct ir3_postsched_node, dag);
|
||||
child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
|
||||
}
|
||||
|
||||
list_addtail(&instr->node, &instr->block->instr_list);
|
||||
|
||||
dag_prune_head(ctx->dag, &n->dag);
|
||||
|
||||
struct ir3_postsched_block_data *bd = ctx->block->data;
|
||||
bd->legalize_state.cycle += n->delay;
|
||||
ir3_update_legalize_state(&bd->legalize_state, ctx->v->compiler, instr);
|
||||
|
||||
if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
|
||||
return;
|
||||
|
||||
if (is_ss_producer(instr)) {
|
||||
ctx->ss_delay = soft_ss_delay(instr);
|
||||
bd->ss_delay = soft_ss_delay(instr);
|
||||
} else if (has_ss_src(instr)) {
|
||||
ctx->ss_delay = 0;
|
||||
} else if (ctx->ss_delay > 0) {
|
||||
ctx->ss_delay--;
|
||||
bd->ss_delay = 0;
|
||||
} else if (bd->ss_delay > 0) {
|
||||
bd->ss_delay--;
|
||||
}
|
||||
|
||||
if (is_sy_producer(instr)) {
|
||||
ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
|
||||
bd->sy_delay = soft_sy_delay(instr, ctx->block->shader);
|
||||
} else if (has_sy_src(instr)) {
|
||||
ctx->sy_delay = 0;
|
||||
} else if (ctx->sy_delay > 0) {
|
||||
ctx->sy_delay--;
|
||||
bd->sy_delay = 0;
|
||||
} else if (bd->sy_delay > 0) {
|
||||
bd->sy_delay--;
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned
|
||||
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
|
||||
{
|
||||
return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
|
||||
return n->delay;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
|
||||
{
|
||||
unsigned delay = node_delay(ctx, n);
|
||||
struct ir3_postsched_block_data *bd = n->instr->block->data;
|
||||
|
||||
/* This takes into account that when we schedule multiple tex or sfu, the
|
||||
* first user has to wait for all of them to complete.
|
||||
*/
|
||||
if (n->has_ss_src)
|
||||
delay = MAX2(delay, ctx->ss_delay);
|
||||
if (n->has_sy_src)
|
||||
delay = MAX2(delay, ctx->sy_delay);
|
||||
if (has_ss_src(n->instr))
|
||||
delay = MAX2(delay, bd->ss_delay);
|
||||
if (has_sy_src(n->instr))
|
||||
delay = MAX2(delay, bd->sy_delay);
|
||||
|
||||
return delay;
|
||||
}
|
||||
|
|
@ -208,6 +202,20 @@ choose_instr(struct ir3_postsched_ctx *ctx)
|
|||
{
|
||||
struct ir3_postsched_node *chosen = NULL;
|
||||
|
||||
struct ir3_postsched_block_data *bd = ctx->block->data;
|
||||
|
||||
/* Needed sync flags and nop delays potentially change after scheduling an
|
||||
* instruction. Update them for all schedulable instructions.
|
||||
*/
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
enum ir3_instruction_flags sync_flags = ir3_required_sync_flags(
|
||||
&bd->legalize_state, ctx->v->compiler, n->instr);
|
||||
n->instr->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
|
||||
n->instr->flags |= sync_flags;
|
||||
n->delay =
|
||||
ir3_required_delay(&bd->legalize_state, ctx->v->compiler, n->instr);
|
||||
}
|
||||
|
||||
dump_state(ctx);
|
||||
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
|
|
@ -576,8 +584,6 @@ sched_dag_max_delay_cb(struct dag_node *node, void *state)
|
|||
static void
|
||||
sched_dag_init(struct ir3_postsched_ctx *ctx)
|
||||
{
|
||||
ctx->mem_ctx = ralloc_context(NULL);
|
||||
|
||||
ctx->dag = dag_create(ctx->mem_ctx);
|
||||
|
||||
foreach_instr (instr, &ctx->unscheduled_list)
|
||||
|
|
@ -656,17 +662,44 @@ sched_dag_init(struct ir3_postsched_ctx *ctx)
|
|||
static void
|
||||
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
|
||||
{
|
||||
ralloc_free(ctx->mem_ctx);
|
||||
ctx->mem_ctx = NULL;
|
||||
ctx->dag = NULL;
|
||||
}
|
||||
|
||||
static struct ir3_legalize_state *
|
||||
get_block_legalize_state(struct ir3_block *block)
|
||||
{
|
||||
struct ir3_postsched_block_data *bd = block->data;
|
||||
return bd ? &bd->legalize_state : NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
|
||||
{
|
||||
ctx->block = block;
|
||||
ctx->sy_delay = 0;
|
||||
ctx->ss_delay = 0;
|
||||
struct ir3_postsched_block_data *bd =
|
||||
rzalloc(ctx->mem_ctx, struct ir3_postsched_block_data);
|
||||
block->data = bd;
|
||||
|
||||
ir3_init_legalize_state(&bd->legalize_state, ctx->v->compiler);
|
||||
ir3_merge_pred_legalize_states(&bd->legalize_state, block,
|
||||
get_block_legalize_state);
|
||||
|
||||
/* Initialize the ss/sy_delay by taking the maximum from the predecessors.
|
||||
* TODO: disable carrying over tex prefetch delays from the preamble for now
|
||||
* as this seems to negatively affect nop count and stalls. This should be
|
||||
* revisited in the future.
|
||||
*/
|
||||
if (block != ir3_after_preamble(ctx->ir)) {
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
struct ir3_postsched_block_data *pred_bd = pred->data;
|
||||
|
||||
if (pred_bd) {
|
||||
bd->sy_delay = MAX2(bd->sy_delay, pred_bd->sy_delay);
|
||||
bd->ss_delay = MAX2(bd->ss_delay, pred_bd->ss_delay);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* The terminator has to stay at the end. Instead of trying to set up
|
||||
* dependencies to achieve this, it's easier to just remove it now and add it
|
||||
|
|
@ -786,13 +819,19 @@ ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
|
|||
struct ir3_postsched_ctx ctx = {
|
||||
.ir = ir,
|
||||
.v = v,
|
||||
.mem_ctx = ralloc_context(NULL),
|
||||
};
|
||||
|
||||
cleanup_self_movs(ir);
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
block->data = NULL;
|
||||
}
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
sched_block(&ctx, block);
|
||||
}
|
||||
|
||||
ralloc_free(ctx.mem_ctx);
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue