diff --git a/src/panfrost/bifrost/bi_opt_dce.c b/src/panfrost/bifrost/bi_opt_dce.c
index 181bf2b23a3..b91de9bb793 100644
--- a/src/panfrost/bifrost/bi_opt_dce.c
+++ b/src/panfrost/bifrost/bi_opt_dce.c
@@ -69,7 +69,7 @@ bi_opt_dead_code_eliminate(bi_context *ctx)
 
 /* Post-RA liveness-based dead code analysis to clean up results of bundling */
 
-static uint64_t
+uint64_t
 bi_postra_liveness_ins(uint64_t live, bi_instr *ins)
 {
         bi_foreach_dest(ins, d) {
@@ -115,7 +115,7 @@ bi_postra_liveness_block(bi_block *blk)
  * adding the predecessors of the block to the work list if we made progress.
  */
 
-static void
+void
 bi_postra_liveness(bi_context *ctx)
 {
         struct set *work_list = _mesa_set_create(NULL,
diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c
index a49c882d003..f4a8873bcce 100644
--- a/src/panfrost/bifrost/bi_schedule.c
+++ b/src/panfrost/bifrost/bi_schedule.c
@@ -679,14 +679,22 @@ bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
 }
 
 /* Is a register written other than the staging mechanism? ATEST is special,
- * writing to both a staging register and a regular register (fixed packing)*/
+ * writing to both a staging register and a regular register (fixed packing).
+ * BLEND is special since it has to write r48 the normal way even if it never
+ * gets read. This depends on liveness analysis, as a register is not needed
+ * for a write that will be discarded after one tuple. */
 
 static bool
-bi_writes_reg(bi_instr *instr)
+bi_writes_reg(bi_instr *instr, uint64_t live_after_temp)
 {
-        return (instr->op == BI_OPCODE_ATEST) ||
-                (!bi_is_null(instr->dest[0]) &&
-                 !bi_opcode_props[instr->op].sr_write);
+        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
+                return true;
+
+        if (bi_is_null(instr->dest[0]) || bi_opcode_props[instr->op].sr_write)
+                return false;
+
+        assert(instr->dest[0].type == BI_INDEX_REGISTER);
+        return (live_after_temp & BITFIELD64_BIT(instr->dest[0].value)) != 0;
 }
 
 /* Instruction placement entails two questions: what subset of instructions in
@@ -700,6 +708,7 @@ static bool
 bi_instr_schedulable(bi_instr *instr,
                      struct bi_clause_state *clause,
                      struct bi_tuple_state *tuple,
+                     uint64_t live_after_temp,
                      bool fma)
 {
         /* The units must match */
@@ -763,10 +772,10 @@ bi_instr_schedulable(bi_instr *instr,
         if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
                 return false;
 
-        /* Register file writes are limited, TODO don't count tempable things */
+        /* Register file writes are limited */
         unsigned total_writes = tuple->reg.nr_writes;
 
-        if (bi_writes_reg(instr))
+        if (bi_writes_reg(instr, live_after_temp))
                 total_writes++;
 
         /* Last tuple in a clause can only write a single value */
@@ -794,7 +803,7 @@ bi_instr_schedulable(bi_instr *instr,
                         tuple->prev_reads, tuple->nr_prev_reads);
 
         /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
-        if (total_writes > MAX2(4 - (signed) succ_reads, 0))
+        if ((signed) total_writes > (4 - (signed) succ_reads))
                 return false;
 
         return true;
@@ -811,6 +820,7 @@ static unsigned
 bi_choose_index(struct bi_worklist st,
                 struct bi_clause_state *clause,
                 struct bi_tuple_state *tuple,
+                uint64_t live_after_temp,
                 bool fma)
 {
         unsigned i, best_idx = ~0;
@@ -819,7 +829,7 @@ bi_choose_index(struct bi_worklist st,
         BITSET_FOREACH_SET(i, st.worklist, st.count) {
                 bi_instr *instr = st.instructions[i];
 
-                if (!bi_instr_schedulable(instr, clause, tuple, fma))
+                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
                         continue;
 
                 signed cost = bi_instr_cost(instr);
@@ -835,7 +845,7 @@ bi_choose_index(struct bi_worklist st,
 
 static void
 bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
-             bi_instr *instr, bool fma)
+             bi_instr *instr, uint64_t live_after_temp, bool fma)
 {
         bi_update_fau(clause, tuple, instr, fma, true);
 
@@ -845,7 +855,7 @@ bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
         clause->access_count += BI_MAX_SRCS;
         clause->accesses[clause->access_count++] = instr->dest[0];
 
-        if (bi_writes_reg(instr))
+        if (bi_writes_reg(instr, live_after_temp))
                 tuple->reg.nr_writes++;
 
         bi_foreach_src(instr, s) {
@@ -861,6 +871,7 @@ static bi_instr *
 bi_take_instr(bi_context *ctx, struct bi_worklist st,
               struct bi_clause_state *clause,
               struct bi_tuple_state *tuple,
+              uint64_t live_after_temp,
               bool fma)
 {
         if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
@@ -880,7 +891,7 @@ bi_take_instr(bi_context *ctx, struct bi_worklist st,
                 return NULL;
 #endif
 
-        unsigned idx = bi_choose_index(st, clause, tuple, fma);
+        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
 
         if (idx >= st.count)
                 return NULL;
@@ -890,7 +901,7 @@ bi_take_instr(bi_context *ctx, struct bi_worklist st,
         BITSET_CLEAR(st.worklist, idx);
         bi_update_worklist(st, idx);
 
-        bi_pop_instr(clause, tuple, instr, fma);
+        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
 
         return instr;
 }
@@ -1260,7 +1271,7 @@ bi_apply_constant_modifiers(struct bi_const_state *consts,
 /* Schedule a single clause. If no instructions remain, return NULL. */
 
 static bi_clause *
-bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
+bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
 {
         struct bi_clause_state clause_state = { 0 };
         bi_clause *clause = rzalloc(ctx, bi_clause);
@@ -1276,6 +1287,15 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
         bi_index prev_reads[5] = { bi_null() };
         unsigned nr_prev_reads = 0;
 
+        /* We need to track future liveness. The main *live set tracks what is
+         * live at the current point in the program we are scheduling, but to
+         * determine temp eligibility, we instead want what will be live after
+         * the next tuple in the program. If you scheduled forwards, you'd need
+         * a crystal ball for this. Luckily we schedule backwards, so we just
+         * delay updates to live_after_temp by an extra tuple. */
+        uint64_t live_after_temp = *live;
+        uint64_t live_next_tuple = live_after_temp;
+
         do {
                 struct bi_tuple_state tuple_state = {
                         .last = (clause->tuple_count == 0),
@@ -1292,11 +1312,27 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
 
                 tuple = &clause->tuples[idx];
 
+                if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
+                        unsigned nr = bi_count_read_registers(clause->message, 0);
+                        live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
+                }
+
                 /* Since we schedule backwards, we schedule ADD first */
-                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, false);
-                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, true);
+                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
+                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
                 tuple->add = tuple_state.add;
 
+                /* Update liveness from the new instructions */
+                if (tuple->add)
+                        *live = bi_postra_liveness_ins(*live, tuple->add);
+
+                if (tuple->fma)
+                        *live = bi_postra_liveness_ins(*live, tuple->fma);
+
+                /* Rotate in the new per-tuple liveness */
+                live_after_temp = live_next_tuple;
+                live_next_tuple = *live;
+
                 /* We may have a message, but only one per clause */
                 if (tuple->add && bi_must_message(tuple->add)) {
                         assert(!clause_state.message);
@@ -1474,9 +1510,12 @@ bi_schedule_block(bi_context *ctx, bi_block *block)
                 return;
         }
 
+        /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */
+        uint64_t live = block->reg_live_out;
+
         /* Schedule as many clauses as needed to fill the block */
         bi_clause *u = NULL;
-        while((u = bi_schedule_clause(ctx, block, st)))
+        while((u = bi_schedule_clause(ctx, block, st, &live)))
                 list_add(&u->link, &block->clauses);
 
         /* Back-to-back bit affects only the last clause of a block,
@@ -1577,6 +1616,9 @@ bi_lower_fau(bi_context *ctx)
 void
 bi_schedule(bi_context *ctx)
 {
+        /* Fed into both scheduling and DCE */
+        bi_postra_liveness(ctx);
+
         bi_foreach_block(ctx, block) {
                 bi_block *bblock = (bi_block *) block;
                 bi_schedule_block(ctx, bblock);
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 87083795ea3..36ddee0b46c 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -795,6 +795,9 @@ void bi_compute_liveness(bi_context *ctx);
 void bi_liveness_ins_update(uint16_t *live, bi_instr *ins, unsigned max);
 void bi_invalidate_liveness(bi_context *ctx);
 
+void bi_postra_liveness(bi_context *ctx);
+uint64_t bi_postra_liveness_ins(uint64_t live, bi_instr *ins);
+
 /* Layout */
 
 bool bi_can_insert_tuple(bi_clause *clause, bool constant);
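Note, not part of the patch: bi_postra_liveness(), now exported above, is a standard backwards work-list dataflow pass, as its own comment says ("adding the predecessors of the block to the work list if we made progress"). The sketch below shows that fixed-point shape on a made-up three-block CFG; the toy_block type, the gen/kill summaries, and the register numbers are all invented for illustration, while the real pass walks bi_block structures and uses the per-instruction transfer bi_postra_liveness_ins() from bi_opt_dce.c.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy CFG node with per-block gen/kill summaries and up to two
 * predecessors/successors (-1 = none). Purely illustrative. */
struct toy_block {
        uint64_t gen;       /* registers read before any write in the block */
        uint64_t kill;      /* registers written by the block */
        uint64_t live_in;
        uint64_t live_out;
        int succ[2];
        int pred[2];
};

#define NBLOCKS 3

int
main(void)
{
        /* B0 -> B1 -> B2, with a back edge B2 -> B1 (a simple loop). */
        struct toy_block blocks[NBLOCKS] = {
                /* B0: r0 = ... */
                { .gen = 0,         .kill = 1ull << 0, .succ = { 1, -1 }, .pred = { -1, -1 } },
                /* B1: r1 = r0 + r1 */
                { .gen = 0x3,       .kill = 1ull << 1, .succ = { 2, -1 }, .pred = { 0, 2 } },
                /* B2: use r1, loop back */
                { .gen = 1ull << 1, .kill = 0,         .succ = { 1, -1 }, .pred = { 1, -1 } },
        };

        bool in_worklist[NBLOCKS] = { false };
        int worklist[64];
        int top = 0;

        /* Seed the work list with every block; seeding everything converges
         * just as well as seeding only the exit. */
        for (int i = NBLOCKS - 1; i >= 0; --i) {
                worklist[top++] = i;
                in_worklist[i] = true;
        }

        while (top > 0) {
                int b = worklist[--top];
                in_worklist[b] = false;
                struct toy_block *blk = &blocks[b];

                /* live-out is the union of the successors' live-in. */
                uint64_t out = 0;
                for (int s = 0; s < 2; ++s) {
                        if (blk->succ[s] >= 0)
                                out |= blocks[blk->succ[s]].live_in;
                }
                blk->live_out = out;

                uint64_t in = (out & ~blk->kill) | blk->gen;
                if (in == blk->live_in)
                        continue;

                /* Progress: push the predecessors back on the work list. */
                blk->live_in = in;
                for (int p = 0; p < 2; ++p) {
                        int pb = blk->pred[p];
                        if (pb >= 0 && !in_worklist[pb]) {
                                worklist[top++] = pb;
                                in_worklist[pb] = true;
                        }
                }
        }

        for (int i = 0; i < NBLOCKS; ++i)
                printf("B%d: live_in=0x%llx live_out=0x%llx\n", i,
                       (unsigned long long) blocks[i].live_in,
                       (unsigned long long) blocks[i].live_out);
        return 0;
}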
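Also not part of the patch: the scheduler-side idea is that a value written by one tuple and consumed only by the immediately following tuple can ride the passthrough network and never needs a register file write, which is why bi_writes_reg() now consults live_after_temp, the set of registers still live after the next tuple in program order. Because clauses are built back to front, that set is simply the liveness from one loop iteration earlier, hence the one-tuple delay in bi_schedule_clause(). Below is a self-contained sketch of the same bookkeeping, with made-up toy_tuple/toy_liveness stand-ins for bi_instr and bi_postra_liveness_ins().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy tuple: one destination and up to two sources, -1 = unused.
 * Index 0 below is the *last* tuple in program order, since we build
 * clauses back to front just like the scheduler does. */
struct toy_tuple {
        int def;
        int src[2];
};

/* Backwards liveness transfer for one tuple: writes kill, reads gen. */
static uint64_t
toy_liveness(uint64_t live, const struct toy_tuple *t)
{
        if (t->def >= 0)
                live &= ~(1ull << t->def);
        for (unsigned s = 0; s < 2; ++s) {
                if (t->src[s] >= 0)
                        live |= (1ull << t->src[s]);
        }
        return live;
}

int
main(void)
{
        /* Program order: r1 = ...;  r2 = f(r1);  g(r1, r2);
         * listed here in the backwards order we schedule them. */
        struct toy_tuple backwards[] = {
                { .def = -1, .src = { 1, 2 } },   /* g(r1, r2), last in program */
                { .def = 2,  .src = { 1, -1 } },  /* r2 = f(r1) */
                { .def = 1,  .src = { -1, -1 } }, /* r1 = ..., first in program */
        };

        uint64_t live = 0;                 /* nothing live out of the block */
        uint64_t live_after_temp = live;   /* live after the tuple *following* the one built now */
        uint64_t live_next_tuple = live;

        for (unsigned i = 0; i < 3; ++i) {
                const struct toy_tuple *t = &backwards[i];

                /* A destination needs a register file write only if it is
                 * still live once the following tuple has executed; otherwise
                 * the value can stay on the one-tuple passthrough network. */
                if (t->def >= 0) {
                        bool needs_reg = (live_after_temp >> t->def) & 1;
                        printf("r%d: %s\n", t->def,
                               needs_reg ? "needs a register write"
                                         : "can stay a passthrough temporary");
                }

                live = toy_liveness(live, t);

                /* Delay the update by one tuple: because we schedule
                 * backwards, the liveness we just had is exactly "after the
                 * next tuple" for the tuple built in the next iteration. */
                live_after_temp = live_next_tuple;
                live_next_tuple = live;
        }

        return 0;
}

Running it reports that r2, read only by the very next tuple, can stay a passthrough temporary, while r1, still read two tuples later, needs a real register write; that is the same distinction the new bi_writes_reg() draws.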
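One more side note on the rewritten successor check in bi_instr_schedulable(): it drops the MAX2() clamp but keeps a (signed) cast on total_writes. That cast is what keeps the comparison sane when 4 - succ_reads goes negative; with an unsigned left-hand side, the usual arithmetic conversions would wrap the negative right-hand side to a huge unsigned value and the check could never reject the instruction. A tiny demonstration with made-up numbers:

#include <stdio.h>

int
main(void)
{
        unsigned total_writes = 1;
        signed succ_reads = 5;   /* successor already reads more than 4 values */

        /* Unsigned comparison: (4 - 5) converts to UINT_MAX, so this is 0 and
         * the instruction would wrongly be considered schedulable. */
        int rejects_unsigned = total_writes > (4 - succ_reads);

        /* Signed comparison, as in the patch: 1 > -1 is true, so the
         * instruction is correctly rejected. */
        int rejects_signed = (signed) total_writes > (4 - succ_reads);

        printf("unsigned compare rejects: %d, signed compare rejects: %d\n",
               rejects_unsigned, rejects_signed);
        return 0;
}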