pan/bi: Track liveness while scheduling

If we know that a value is killed in the next tuple, there is no need to
write it out to the register file. We already handled this as a packing
fixup. However, avoiding this write also frees up an extra slot in the
register block, which offers additional scheduling freedom. To take
advantage of this, we extend liveness analysis to work while scheduling,
and modify the schedulable predicate accordingly.
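The heart of the change is a single bit test against a 64-bit live mask. Here is a minimal standalone sketch, not the Mesa API: the helper name and register numbers are invented, and BITFIELD64_BIT is copied locally so the snippet compiles on its own.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Local copy of Mesa's BITFIELD64_BIT so this sketch is self-contained. */
#define BITFIELD64_BIT(b) (1ull << (b))

/* Hypothetical helper: a register-file write is needed only if the
 * destination register is still live after the next tuple; otherwise the
 * value travels through the passthrough (temporary) slots and its write
 * slot in the register block is freed for the scheduler. */
static bool
needs_register_write(unsigned dest_reg, uint64_t live_after_next_tuple)
{
        assert(dest_reg < 64);
        return (live_after_next_tuple & BITFIELD64_BIT(dest_reg)) != 0;
}

int main(void)
{
        uint64_t live = BITFIELD64_BIT(5); /* only r5 is read later */

        assert( needs_register_write(5, live)); /* r5 must reach the file */
        assert(!needs_register_write(6, live)); /* r6 dies: stays in temps */
        return 0;
}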

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11123>
Author: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Date:   2021-05-06 11:11:09 -04:00 (committed by Marge Bot)
Parent: 20809daa9a
Commit: 7aefd6c1ba

3 files changed, 64 insertions(+), 19 deletions(-)

Changed file 1 of 3:

@@ -69,7 +69,7 @@ bi_opt_dead_code_eliminate(bi_context *ctx)
 /* Post-RA liveness-based dead code analysis to clean up results of bundling */
-static uint64_t
+uint64_t
 bi_postra_liveness_ins(uint64_t live, bi_instr *ins)
 {
         bi_foreach_dest(ins, d) {
@@ -115,7 +115,7 @@ bi_postra_liveness_block(bi_block *blk)
  * adding the predecessors of the block to the work list if we made progress.
  */
-static void
+void
 bi_postra_liveness(bi_context *ctx)
 {
         struct set *work_list = _mesa_set_create(NULL,

Changed file 2 of 3:

@@ -679,14 +679,22 @@ bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
 }
 
 /* Is a register written other than the staging mechanism? ATEST is special,
- * writing to both a staging register and a regular register (fixed packing)*/
+ * writing to both a staging register and a regular register (fixed packing).
+ * BLEND is special since it has to write r48 the normal way even if it never
+ * gets read. This depends on liveness analysis, as a register is not needed
+ * for a write that will be discarded after one tuple. */
 
 static bool
-bi_writes_reg(bi_instr *instr)
+bi_writes_reg(bi_instr *instr, uint64_t live_after_temp)
 {
-        return (instr->op == BI_OPCODE_ATEST) ||
-               (!bi_is_null(instr->dest[0]) &&
-                !bi_opcode_props[instr->op].sr_write);
+        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
+                return true;
+
+        if (bi_is_null(instr->dest[0]) || bi_opcode_props[instr->op].sr_write)
+                return false;
+
+        assert(instr->dest[0].type == BI_INDEX_REGISTER);
+        return (live_after_temp & BITFIELD64_BIT(instr->dest[0].value)) != 0;
 }
 
 /* Instruction placement entails two questions: what subset of instructions in
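To make the new predicate concrete (register numbers invented for illustration): ATEST and BLEND return true unconditionally; a null destination or a pure staging write (sr_write) returns false; and an ordinary write to, say, r5 returns true only when bit 5 of live_after_temp is set, i.e. when some instruction beyond the next tuple still reads r5. A clear bit means the write can be discarded after one tuple and costs no register-file slot.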
@@ -700,6 +708,7 @@ static bool
 bi_instr_schedulable(bi_instr *instr,
                      struct bi_clause_state *clause,
                      struct bi_tuple_state *tuple,
+                     uint64_t live_after_temp,
                      bool fma)
 {
         /* The units must match */
@@ -763,10 +772,10 @@ bi_instr_schedulable(bi_instr *instr,
         if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
                 return false;
 
-        /* Register file writes are limited, TODO don't count tempable things */
+        /* Register file writes are limited */
         unsigned total_writes = tuple->reg.nr_writes;
 
-        if (bi_writes_reg(instr))
+        if (bi_writes_reg(instr, live_after_temp))
                 total_writes++;
 
         /* Last tuple in a clause can only write a single value */
@@ -794,7 +803,7 @@ bi_instr_schedulable(bi_instr *instr,
                          tuple->prev_reads, tuple->nr_prev_reads);
 
         /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
-        if (total_writes > MAX2(4 - (signed) succ_reads, 0))
+        if ((signed) total_writes > (4 - (signed) succ_reads))
                 return false;
 
         return true;
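A quick worked check of the new comparison (values illustrative): with succ_reads = 3 the bound is W <= 1, so a second register-file write makes the tuple unschedulable. With succ_reads = 5 the bound is W <= -1, and the signed form rejects the tuple even with zero writes, a case the old MAX2 clamp silently accepted.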
@@ -811,6 +820,7 @@ static unsigned
 bi_choose_index(struct bi_worklist st,
                 struct bi_clause_state *clause,
                 struct bi_tuple_state *tuple,
+                uint64_t live_after_temp,
                 bool fma)
 {
         unsigned i, best_idx = ~0;
@@ -819,7 +829,7 @@ bi_choose_index(struct bi_worklist st,
         BITSET_FOREACH_SET(i, st.worklist, st.count) {
                 bi_instr *instr = st.instructions[i];
 
-                if (!bi_instr_schedulable(instr, clause, tuple, fma))
+                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
                         continue;
 
                 signed cost = bi_instr_cost(instr);
@@ -835,7 +845,7 @@
 
 static void
 bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
-             bi_instr *instr, bool fma)
+             bi_instr *instr, uint64_t live_after_temp, bool fma)
 {
         bi_update_fau(clause, tuple, instr, fma, true);
@@ -845,7 +855,7 @@ bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
                 clause->access_count += BI_MAX_SRCS;
 
         clause->accesses[clause->access_count++] = instr->dest[0];
 
-        if (bi_writes_reg(instr))
+        if (bi_writes_reg(instr, live_after_temp))
                 tuple->reg.nr_writes++;
 
         bi_foreach_src(instr, s) {
@@ -861,6 +871,7 @@ static bi_instr *
 bi_take_instr(bi_context *ctx, struct bi_worklist st,
               struct bi_clause_state *clause,
               struct bi_tuple_state *tuple,
+              uint64_t live_after_temp,
               bool fma)
 {
         if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
@@ -880,7 +891,7 @@ bi_take_instr(bi_context *ctx, struct bi_worklist st,
                 return NULL;
 #endif
 
-        unsigned idx = bi_choose_index(st, clause, tuple, fma);
+        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
 
         if (idx >= st.count)
                 return NULL;
@@ -890,7 +901,7 @@ bi_take_instr(bi_context *ctx, struct bi_worklist st,
         BITSET_CLEAR(st.worklist, idx);
         bi_update_worklist(st, idx);
 
-        bi_pop_instr(clause, tuple, instr, fma);
+        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
 
         return instr;
 }
@@ -1260,7 +1271,7 @@ bi_apply_constant_modifiers(struct bi_const_state *consts,
 
 /* Schedule a single clause. If no instructions remain, return NULL. */
 
 static bi_clause *
-bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
+bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
 {
         struct bi_clause_state clause_state = { 0 };
         bi_clause *clause = rzalloc(ctx, bi_clause);
@@ -1276,6 +1287,15 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
         bi_index prev_reads[5] = { bi_null() };
         unsigned nr_prev_reads = 0;
 
+        /* We need to track future liveness. The main *live set tracks what is
+         * live at the current point in the program we are scheduling, but to
+         * determine temp eligibility, we instead want what will be live after
+         * the next tuple in the program. If you scheduled forwards, you'd need
+         * a crystal ball for this. Luckily we schedule backwards, so we just
+         * delay updates to live_after_temp by an extra tuple. */
+        uint64_t live_after_temp = *live;
+        uint64_t live_next_tuple = live_after_temp;
+
         do {
                 struct bi_tuple_state tuple_state = {
                         .last = (clause->tuple_count == 0),
@@ -1292,11 +1312,27 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st)
                 tuple = &clause->tuples[idx];
 
+                if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
+                        unsigned nr = bi_count_read_registers(clause->message, 0);
+                        live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
+                }
+
                 /* Since we schedule backwards, we schedule ADD first */
-                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, false);
-                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, true);
+                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
+                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
                 tuple->add = tuple_state.add;
 
+                /* Update liveness from the new instructions */
+                if (tuple->add)
+                        *live = bi_postra_liveness_ins(*live, tuple->add);
+                if (tuple->fma)
+                        *live = bi_postra_liveness_ins(*live, tuple->fma);
+
+                /* Rotate in the new per-tuple liveness */
+                live_after_temp = live_next_tuple;
+                live_next_tuple = *live;
+
                 /* We may have a message, but only one per clause */
                 if (tuple->add && bi_must_message(tuple->add)) {
                         assert(!clause_state.message);
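Because the rotation above is easy to misread, here is a self-contained sketch of just that pattern with toy values (the array contents and names are invented; only the delay-by-one-tuple rotation mirrors the code). Scheduling backwards, "live after the next tuple" is exactly the live set from one iteration earlier:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Toy live sets produced after each liveness update, in the
         * backwards order the scheduler visits tuples (last tuple first). */
        const uint64_t live_after_update[3] = { 0x01, 0x03, 0x07 };

        uint64_t live = 0x01;              /* live-out of the block */
        uint64_t live_after_temp = live;   /* what bi_writes_reg consults */
        uint64_t live_next_tuple = live;

        for (int t = 0; t < 3; ++t) {
                /* ...the tuple is scheduled here against live_after_temp... */
                printf("tuple %d sees live_after_temp = 0x%02llx\n",
                       t, (unsigned long long) live_after_temp);

                /* Stand-in for the bi_postra_liveness_ins() calls. */
                live = live_after_update[t];

                /* Rotate: the update reaches live_after_temp one tuple late. */
                live_after_temp = live_next_tuple;
                live_next_tuple = live;
        }

        return 0;
}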
@@ -1474,9 +1510,12 @@ bi_schedule_block(bi_context *ctx, bi_block *block)
                 return;
         }
 
+        /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */
+        uint64_t live = block->reg_live_out;
+
         /* Schedule as many clauses as needed to fill the block */
         bi_clause *u = NULL;
-        while((u = bi_schedule_clause(ctx, block, st)))
+        while((u = bi_schedule_clause(ctx, block, st, &live)))
                 list_add(&u->link, &block->clauses);
 
         /* Back-to-back bit affects only the last clause of a block,
@@ -1577,6 +1616,9 @@ bi_lower_fau(bi_context *ctx)
 void
 bi_schedule(bi_context *ctx)
 {
+        /* Fed into both scheduling and DCE */
+        bi_postra_liveness(ctx);
+
         bi_foreach_block(ctx, block) {
                 bi_block *bblock = (bi_block *) block;
                 bi_schedule_block(ctx, bblock);

Changed file 3 of 3:

@@ -795,6 +795,9 @@ void bi_compute_liveness(bi_context *ctx);
 void bi_liveness_ins_update(uint16_t *live, bi_instr *ins, unsigned max);
 void bi_invalidate_liveness(bi_context *ctx);
 void bi_postra_liveness(bi_context *ctx);
+uint64_t bi_postra_liveness_ins(uint64_t live, bi_instr *ins);
 
 /* Layout */
 bool bi_can_insert_tuple(bi_clause *clause, bool constant);
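Taken together, the exports establish a fixed calling order: bi_schedule() runs bi_postra_liveness() once up front (its results also feed post-RA dead code elimination), bi_schedule_block() seeds a local live mask from block->reg_live_out, and bi_schedule_clause() threads that mask backwards through bi_postra_liveness_ins() as each tuple is emitted. Scheduling before the analysis would consume a stale reg_live_out, which is why the analysis is invoked at the very top of bi_schedule().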