From 7aefd6c1ba2d9ca019bbb2ef283d12be97ca93ba Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Thu, 6 May 2021 11:11:09 -0400
Subject: [PATCH] pan/bi: Track liveness while scheduling

If we know that a value is killed in the next tuple, there is no need
to write it out to the register file. We already handled this as a
packing fixup. However, avoiding this write also frees up an extra slot
in the register block, which offers additional scheduling freedom. To
take advantage of this, we extend liveness analysis to work while
scheduling, and modify the schedulable predicate accordingly.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/panfrost/bifrost/bi_opt_dce.c  |  4 +-
 src/panfrost/bifrost/bi_schedule.c | 76 +++++++++++++++++++++++-------
 src/panfrost/bifrost/compiler.h    |  3 ++
 3 files changed, 64 insertions(+), 19 deletions(-)

diff --git a/src/panfrost/bifrost/bi_opt_dce.c b/src/panfrost/bifrost/bi_opt_dce.c
index 181bf2b23a3..b91de9bb793 100644
--- a/src/panfrost/bifrost/bi_opt_dce.c
+++ b/src/panfrost/bifrost/bi_opt_dce.c
@@ -69,7 +69,7 @@ bi_opt_dead_code_eliminate(bi_context *ctx)
 
 /* Post-RA liveness-based dead code analysis to clean up results of bundling */
 
-static uint64_t
+uint64_t
 bi_postra_liveness_ins(uint64_t live, bi_instr *ins)
 {
         bi_foreach_dest(ins, d) {
@@ -115,7 +115,7 @@ bi_postra_liveness_block(bi_block *blk)
  * adding the predecessors of the block to the work list if we made progress.
  */
 
-static void
+void
 bi_postra_liveness(bi_context *ctx)
 {
         struct set *work_list = _mesa_set_create(NULL,
diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c
index a49c882d003..f4a8873bcce 100644
--- a/src/panfrost/bifrost/bi_schedule.c
+++ b/src/panfrost/bifrost/bi_schedule.c
@@ -679,14 +679,22 @@ bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
 }
 
 /* Is a register written other than the staging mechanism? ATEST is special,
- * writing to both a staging register and a regular register (fixed packing)*/
+ * writing to both a staging register and a regular register (fixed packing).
+ * BLEND is special since it has to write r48 the normal way even if it never
+ * gets read. This depends on liveness analysis, as a register is not needed
+ * for a write that will be discarded after one tuple. */
 
 static bool
-bi_writes_reg(bi_instr *instr)
+bi_writes_reg(bi_instr *instr, uint64_t live_after_temp)
 {
-        return (instr->op == BI_OPCODE_ATEST) ||
-               (!bi_is_null(instr->dest[0]) &&
-                !bi_opcode_props[instr->op].sr_write);
+        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
+                return true;
+
+        if (bi_is_null(instr->dest[0]) || bi_opcode_props[instr->op].sr_write)
+                return false;
+
+        assert(instr->dest[0].type == BI_INDEX_REGISTER);
+        return (live_after_temp & BITFIELD64_BIT(instr->dest[0].value)) != 0;
 }
 
 /* Instruction placement entails two questions: what subset of instructions in
@@ -700,6 +708,7 @@ static bool
 bi_instr_schedulable(bi_instr *instr,
                 struct bi_clause_state *clause,
                 struct bi_tuple_state *tuple,
+                uint64_t live_after_temp,
                 bool fma)
 {
         /* The units must match */
@@ -763,10 +772,10 @@ bi_instr_schedulable(bi_instr *instr,
         if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
                 return false;
 
-        /* Register file writes are limited, TODO don't count tempable things */
+        /* Register file writes are limited */
         unsigned total_writes = tuple->reg.nr_writes;
 
-        if (bi_writes_reg(instr))
+        if (bi_writes_reg(instr, live_after_temp))
                 total_writes++;
 
         /* Last tuple in a clause can only write a single value */
@@ -794,7 +803,7 @@ bi_instr_schedulable(bi_instr *instr,
                         tuple->prev_reads, tuple->nr_prev_reads);
 
         /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
-        if (total_writes > MAX2(4 - (signed) succ_reads, 0))
+        if ((signed) total_writes > (4 - (signed) succ_reads))
                 return false;
 
         return true;
@@ -811,6 +820,7 @@ static unsigned
 bi_choose_index(struct bi_worklist st,
                 struct bi_clause_state *clause,
                 struct bi_tuple_state *tuple,
+                uint64_t live_after_temp,
                 bool fma)
 {
         unsigned i, best_idx = ~0;
@@ -819,7 +829,7 @@ bi_choose_index(struct bi_worklist st,
         BITSET_FOREACH_SET(i, st.worklist, st.count) {
                 bi_instr *instr = st.instructions[i];
 
-                if (!bi_instr_schedulable(instr, clause, tuple, fma))
+                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
                         continue;
 
                 signed cost = bi_instr_cost(instr);
@@ -835,7 +845,7 @@ bi_choose_index(struct bi_worklist st,
 
 static void
 bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
-                bi_instr *instr, bool fma)
+                bi_instr *instr, uint64_t live_after_temp, bool fma)
 {
         bi_update_fau(clause, tuple, instr, fma, true);
 
@@ -845,7 +855,7 @@ bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
         clause->access_count += BI_MAX_SRCS;
         clause->accesses[clause->access_count++] = instr->dest[0];
 
-        if (bi_writes_reg(instr))
+        if (bi_writes_reg(instr, live_after_temp))
                 tuple->reg.nr_writes++;
 
         bi_foreach_src(instr, s) {
@@ -861,6 +871,7 @@ static bi_instr *
 bi_take_instr(bi_context *ctx, struct bi_worklist st,
                 struct bi_clause_state *clause,
                 struct bi_tuple_state *tuple,
+                uint64_t live_after_temp,
                 bool fma)
 {
         if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
@@ -880,7 +891,7 @@ bi_take_instr(bi_context *ctx, struct bi_worklist st,
                 return NULL;
 #endif
 
-        unsigned idx = bi_choose_index(st, clause, tuple, fma);
+        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
 
         if (idx >= st.count)
                 return NULL;
@@ -890,7 +901,7 @@ bi_take_instr(bi_context *ctx, struct bi_worklist st,
         BITSET_CLEAR(st.worklist, idx);
         bi_update_worklist(st, idx);
 
-        bi_pop_instr(clause, tuple, instr, fma);
+        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
 
         return instr;
 }
@@ -1260,7 +1271,7 @@ bi_apply_constant_modifiers(struct bi_const_state *consts,
 /* Schedule a single clause. If no instructions remain, return NULL. */
 
 static bi_clause *
-bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
+bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
 {
         struct bi_clause_state clause_state = { 0 };
         bi_clause *clause = rzalloc(ctx, bi_clause);
@@ -1276,6 +1287,15 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
         bi_index prev_reads[5] = { bi_null() };
         unsigned nr_prev_reads = 0;
 
+        /* We need to track future liveness. The main *live set tracks what is
+         * live at the current point in the program we are scheduling, but to
+         * determine temp eligibility, we instead want what will be live after
+         * the next tuple in the program. If you scheduled forwards, you'd need
+         * a crystal ball for this. Luckily we schedule backwards, so we just
+         * delay updates to the live_after_temp by an extra tuple. */
+        uint64_t live_after_temp = *live;
+        uint64_t live_next_tuple = live_after_temp;
+
         do {
                 struct bi_tuple_state tuple_state = {
                         .last = (clause->tuple_count == 0),
@@ -1292,11 +1312,27 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
 
                 tuple = &clause->tuples[idx];
 
+                if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
+                        unsigned nr = bi_count_read_registers(clause->message, 0);
+                        live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
+                }
+
                 /* Since we schedule backwards, we schedule ADD first */
-                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, false);
-                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, true);
+                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
+                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
                 tuple->add = tuple_state.add;
 
+                /* Update liveness from the new instructions */
+                if (tuple->add)
+                        *live = bi_postra_liveness_ins(*live, tuple->add);
+
+                if (tuple->fma)
+                        *live = bi_postra_liveness_ins(*live, tuple->fma);
+
+                /* Rotate in the new per-tuple liveness */
+                live_after_temp = live_next_tuple;
+                live_next_tuple = *live;
+
                 /* We may have a message, but only one per clause */
                 if (tuple->add && bi_must_message(tuple->add)) {
                         assert(!clause_state.message);
@@ -1474,9 +1510,12 @@ bi_schedule_block(bi_context *ctx, bi_block *block)
                 return;
         }
 
+        /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */
+        uint64_t live = block->reg_live_out;
+
         /* Schedule as many clauses as needed to fill the block */
         bi_clause *u = NULL;
-        while((u = bi_schedule_clause(ctx, block, st)))
+        while((u = bi_schedule_clause(ctx, block, st, &live)))
                 list_add(&u->link, &block->clauses);
 
         /* Back-to-back bit affects only the last clause of a block,
@@ -1577,6 +1616,9 @@ bi_lower_fau(bi_context *ctx)
 void
 bi_schedule(bi_context *ctx)
 {
+        /* Fed into both scheduling and DCE */
+        bi_postra_liveness(ctx);
+
         bi_foreach_block(ctx, block) {
                 bi_block *bblock = (bi_block *) block;
                 bi_schedule_block(ctx, bblock);
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 87083795ea3..36ddee0b46c 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -795,6 +795,9 @@ void bi_compute_liveness(bi_context *ctx);
 void bi_liveness_ins_update(uint16_t *live, bi_instr *ins, unsigned max);
 void bi_invalidate_liveness(bi_context *ctx);
 
+void bi_postra_liveness(bi_context *ctx);
+uint64_t bi_postra_liveness_ins(uint64_t live, bi_instr *ins);
+
 /* Layout */
 
 bool bi_can_insert_tuple(bi_clause *clause, bool constant);
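
For readers following along, the live_after_temp rotation in bi_schedule_clause()
can be distilled into a minimal standalone C sketch. Everything below (fake_ins,
its def/use bitmasks, the three-tuple program) is invented for illustration and
is not Mesa code; the real pass operates on bi_instr through
bi_postra_liveness_ins() and bi_writes_reg().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for bi_instr: one write mask, one read mask */
struct fake_ins {
        uint64_t defs; /* registers written */
        uint64_t uses; /* registers read */
};

/* Backwards transfer function, in the spirit of bi_postra_liveness_ins():
 * live-in = (live-out & ~defs) | uses */
static uint64_t
liveness_ins(uint64_t live, const struct fake_ins *ins)
{
        return (live & ~ins->defs) | ins->uses;
}

int
main(void)
{
        /* Program order: r1 = f(r3); r2 = f(r1); r0 = f(r2). Only r0 is
         * live out of the block; r1 and r2 each die one tuple after
         * being written, so they are temp (passthrough) eligible. */
        const struct fake_ins tuples[3] = {
                { .defs = 1ull << 1, .uses = 1ull << 3 },
                { .defs = 1ull << 2, .uses = 1ull << 1 },
                { .defs = 1ull << 0, .uses = 1ull << 2 },
        };

        uint64_t live = 1ull << 0;       /* live-out of the block */
        uint64_t live_after_temp = live; /* what the write predicate sees */
        uint64_t live_next_tuple = live;

        /* Schedule backwards, one "tuple" at a time */
        for (int i = 2; i >= 0; i--) {
                /* A destination that is dead after the next tuple never
                 * needs the register file (cf. bi_writes_reg()) */
                bool needs_write = (live_after_temp & tuples[i].defs) != 0;
                printf("tuple %d: register file write %s\n",
                       i, needs_write ? "needed" : "avoided");

                /* Step liveness back across this tuple */
                live = liveness_ins(live, &tuples[i]);

                /* Rotate: live_after_temp lags the live set by one tuple,
                 * so it always holds "live after the next tuple" */
                live_after_temp = live_next_tuple;
                live_next_tuple = live;
        }

        return 0;
}

Running this prints that only the write of r0 (live out of the block) needs the
register file, while the writes of r1 and r2 are avoided. Because scheduling
proceeds backwards, the one-tuple lookahead falls out of a simple two-variable
rotation rather than requiring any forward prediction.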