diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 53ee9bfec7d..dd1263d3a60 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1288,6 +1288,26 @@ reg_size(const struct ir3_register *reg)
    return reg_elems(reg) * reg_elem_size(reg);
 }
 
+/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
+ * and handle relative accesses specially.
+ */
+
+static inline unsigned
+post_ra_reg_elems(struct ir3_register *reg)
+{
+   if (reg->flags & IR3_REG_RELATIV)
+      return reg->size;
+   return reg_elems(reg);
+}
+
+static inline unsigned
+post_ra_reg_num(struct ir3_register *reg)
+{
+   if (reg->flags & IR3_REG_RELATIV)
+      return reg->array.base;
+   return reg->num;
+}
+
 static inline unsigned
 dest_regs(struct ir3_instruction *instr)
 {
@@ -1871,8 +1891,6 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                                     struct ir3_instruction *consumer,
                                     unsigned assigner_n, unsigned consumer_n);
-unsigned ir3_delay_calc(struct ir3_block *block,
-                        struct ir3_instruction *instr, bool mergedregs);
 
 /* estimated (ss)/(sy) delay calculation */
 
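The two helpers added above give the post-RA view of a register's footprint: a relative access r[a0.x + N] can touch any element of its array, so it is conservatively treated as covering the whole array starting at array.base. Below is a minimal standalone sketch of that behavior; the fake_* types and values are hypothetical stand-ins for ir3_register (the real reg_elems() derives the element count from the register's wrmask).

#include <stdio.h>

#define FAKE_REG_RELATIV (1u << 0) /* stand-in for IR3_REG_RELATIV */

struct fake_reg {
   unsigned flags;
   unsigned num;        /* assigned register, for direct accesses */
   unsigned size;       /* array length, for relative accesses */
   unsigned array_base; /* first register of the array */
};

/* Mirrors post_ra_reg_elems(): a relative access may hit any array element. */
static unsigned
fake_post_ra_reg_elems(const struct fake_reg *reg)
{
   if (reg->flags & FAKE_REG_RELATIV)
      return reg->size;
   return 1; /* stand-in for reg_elems() on a scalar register */
}

/* Mirrors post_ra_reg_num(): a relative access starts at the array's base. */
static unsigned
fake_post_ra_reg_num(const struct fake_reg *reg)
{
   if (reg->flags & FAKE_REG_RELATIV)
      return reg->array_base;
   return reg->num;
}

int
main(void)
{
   struct fake_reg direct = { .flags = 0, .num = 7 };
   struct fake_reg rel = { .flags = FAKE_REG_RELATIV, .size = 4, .array_base = 8 };

   /* direct: r7, 1 element; relative: conservatively r8..r11, 4 elements */
   printf("direct:   num=%u elems=%u\n",
          fake_post_ra_reg_num(&direct), fake_post_ra_reg_elems(&direct));
   printf("relative: num=%u elems=%u\n",
          fake_post_ra_reg_num(&rel), fake_post_ra_reg_elems(&rel));
   return 0;
}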
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index db5b5871c48..735e5ce1738 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -95,38 +95,6 @@ ir3_delayslots(struct ir3_instruction *assigner,
    }
 }
 
-static bool
-count_instruction(struct ir3_instruction *n)
-{
-   /* NOTE: don't count branch/jump since we don't know yet if they will
-    * be eliminated later in resolve_jumps().. really should do that
-    * earlier so we don't have this constraint.
-    */
-   return is_alu(n) ||
-      (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
-       (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
-}
-
-/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
- * and have to handle relative accesses specially.
- */
-
-static unsigned
-post_ra_reg_elems(struct ir3_register *reg)
-{
-   if (reg->flags & IR3_REG_RELATIV)
-      return reg->size;
-   return reg_elems(reg);
-}
-
-static unsigned
-post_ra_reg_num(struct ir3_register *reg)
-{
-   if (reg->flags & IR3_REG_RELATIV)
-      return reg->array.base;
-   return reg->num;
-}
-
 unsigned
 ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                            struct ir3_instruction *consumer,
@@ -211,128 +179,3 @@ ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
 
    return offset > delay ? 0 : delay - offset;
 }
-
-static unsigned
-delay_calc_srcn(struct ir3_instruction *assigner,
-                struct ir3_instruction *consumer, unsigned assigner_n,
-                unsigned consumer_n, bool mergedregs)
-{
-   struct ir3_register *src = consumer->srcs[consumer_n];
-   struct ir3_register *dst = assigner->dsts[assigner_n];
-   bool mismatched_half =
-      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
-
-   /* In the mergedregs case or when the register is a special register,
-    * half-registers do not alias with full registers.
-    */
-   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
-       mismatched_half)
-      return 0;
-
-   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
-   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
-   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
-   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
-
-   if (dst_start >= src_end || src_start >= dst_end)
-      return 0;
-
-   return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
-}
-
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *start,
-           struct ir3_instruction *consumer, unsigned distance,
-           regmask_t *in_mask, bool mergedregs)
-{
-   regmask_t mask;
-   memcpy(&mask, in_mask, sizeof(mask));
-
-   unsigned delay = 0;
-   /* Search backwards starting at the instruction before start, unless it's
-    * NULL then search backwards from the block end.
-    */
-   struct list_head *start_list =
-      start ? start->node.prev : block->instr_list.prev;
-   list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
-                                 &block->instr_list, node) {
-      if (count_instruction(assigner))
-         distance += assigner->nop;
-
-      if (distance + delay >= MAX_NOPS)
-         return delay;
-
-      if (is_meta(assigner))
-         continue;
-
-      unsigned new_delay = 0;
-
-      foreach_dst_n (dst, dst_n, assigner) {
-         if (dst->wrmask == 0)
-            continue;
-         if (!regmask_get(&mask, dst))
-            continue;
-         foreach_src_n (src, src_n, consumer) {
-            if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
-               continue;
-
-            unsigned src_delay = delay_calc_srcn(
-               assigner, consumer, dst_n, src_n, mergedregs);
-            new_delay = MAX2(new_delay, src_delay);
-         }
-         regmask_clear(&mask, dst);
-      }
-
-      new_delay = new_delay > distance ? new_delay - distance : 0;
-      delay = MAX2(delay, new_delay);
-
-      if (count_instruction(assigner))
-         distance += 1 + assigner->repeat;
-   }
-
-   /* Note: this allows recursion into "block" if it has already been
-    * visited, but *not* recursion into its predecessors. We may have to
-    * visit the original block twice, for the loop case where we have to
-    * consider definititons in an earlier iterations of the same loop:
-    *
-    * while (...) {
-    *    mov.u32u32 ..., r0.x
-    *    ...
-    *    mov.u32u32 r0.x, ...
-    * }
-    *
-    * However any other recursion would be unnecessary.
-    */
-
-   if (block->data != block) {
-      block->data = block;
-
-      for (unsigned i = 0; i < block->predecessors_count; i++) {
-         struct ir3_block *pred = block->predecessors[i];
-         unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
-                                          &mask, mergedregs);
-         delay = MAX2(delay, pred_delay);
-      }
-
-      block->data = NULL;
-   }
-
-   return delay;
-}
-
-/**
- * Calculate delay for nop insertion. This must exactly match hardware
- * requirements, including recursing into predecessor blocks.
- */
-unsigned
-ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-               bool mergedregs)
-{
-   regmask_t mask;
-   regmask_init(&mask, mergedregs);
-   foreach_src (src, instr) {
-      if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
-         regmask_set(&mask, src);
-   }
-
-   return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
-}
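For context on what the removed backwards walker computed: an assigner's result is usable only a fixed number of cycles after it issues, and every instruction already sitting between assigner and consumer eats into that budget. A toy sketch of just that arithmetic follows; the 6-cycle figure is illustrative, the real per-pair numbers come from ir3_delayslots().

#include <stdio.h>

/* Mirrors "new_delay > distance ? new_delay - distance : 0" from the removed
 * delay_calc(): the nops still owed once `distance` instructions already sit
 * between assigner and consumer.
 */
static unsigned
nops_needed(unsigned delayslots, unsigned distance)
{
   return delayslots > distance ? delayslots - distance : 0;
}

int
main(void)
{
   /* Illustrative: a consumer needing 6 delay slots with 2 instructions
    * already in between still needs 4 nops; a fully covered dependency
    * needs none.
    */
   printf("%u\n", nops_needed(6, 2)); /* -> 4 */
   printf("%u\n", nops_needed(3, 5)); /* -> 0 */
   return 0;
}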
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 53e9c51fd49..b8148c236d9 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -54,11 +54,37 @@ struct ir3_legalize_ctx {
    bool has_inputs;
 };
 
+struct ir3_nop_state {
+   unsigned full_ready[4 * 48];
+   unsigned half_ready[4 * 48];
+};
+
 struct ir3_legalize_state {
    regmask_t needs_ss;
    regmask_t needs_ss_war; /* write after read */
    regmask_t needs_sy;
    bool needs_ss_for_const;
+
+   /* Each of these arrays contains the cycle when the corresponding register
+    * becomes "ready", i.e. does not require any more nops. There is a special
+    * mechanism to let ALU instructions read compatible (i.e. same halfness)
+    * destinations of another ALU instruction with less delay, so this can
+    * depend on what type the consuming instruction is, which is why there are
+    * multiple arrays. The cycle is counted relative to the start of the block.
+    */
+
+   /* When ALU instructions reading the given full/half register will be ready.
+    */
+   struct ir3_nop_state alu_nop;
+
+   /* When non-ALU (e.g. cat5) instructions reading the given full/half
+    * register will be ready.
+    */
+   struct ir3_nop_state non_alu_nop;
+
+   /* When p0.x-w, a0.x, and a1.x are ready. */
+   unsigned pred_ready[4];
+   unsigned addr_ready[2];
 };
 
 struct ir3_legalize_block_data {
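The new scheme replaces the backwards walk with a forward scoreboard: writers record the cycle at which each destination becomes readable, and a consumer's nop count is simply how far that ready cycle lies past the current cycle. A toy sketch of the idea, using an assumed 3-cycle ALU-to-ALU same-size latency (the figure delay_update() uses further down; everything here is simplified stand-in code, not the Mesa data structures):

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
   unsigned ready[8] = { 0 }; /* per-register "ready at cycle" scoreboard */
   unsigned cycle = 0;

   /* Writer: an ALU instruction writes r1 at cycle 0; an ALU consumer of the
    * same-size destination may read it 3 cycles later (assumed latency).
    */
   ready[1] = MAX2(ready[1], cycle + 3);
   cycle++;

   /* Consumer at cycle 1: it owes ready - cycle = 2 nops before issuing. */
   unsigned nops = MAX2(ready[1], cycle) - cycle;
   printf("nops before consumer: %u\n", nops); /* -> 2 */
   return 0;
}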
@@ -87,6 +113,177 @@ apply_sy(struct ir3_instruction *instr,
    regmask_init(&state->needs_sy, mergedregs);
 }
 
+static bool
+count_instruction(struct ir3_instruction *n)
+{
+   /* NOTE: don't count branch/jump since we don't know yet if they will
+    * be eliminated later in resolve_jumps().. really should do that
+    * earlier so we don't have this constraint.
+    */
+   return is_alu(n) ||
+      (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
+       (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
+}
+
+static unsigned *
+get_ready_slot(struct ir3_legalize_state *state,
+               struct ir3_register *reg, unsigned num,
+               bool consumer_alu, bool matching_size)
+{
+   if (reg->flags & IR3_REG_PREDICATE) {
+      assert(num == reg->num);
+      assert(reg_num(reg) == REG_P0);
+      return &state->pred_ready[reg_comp(reg)];
+   }
+   if (reg->num == regid(REG_A0, 0))
+      return &state->addr_ready[0];
+   if (reg->num == regid(REG_A0, 1))
+      return &state->addr_ready[1];
+   struct ir3_nop_state *nop =
+      consumer_alu ? &state->alu_nop : &state->non_alu_nop;
+   assert(!(reg->flags & IR3_REG_SHARED));
+   if (reg->flags & IR3_REG_HALF) {
+      if (matching_size)
+         return &nop->half_ready[num];
+      else
+         return &nop->full_ready[num / 2];
+   } else {
+      if (matching_size)
+         return &nop->full_ready[num];
+      /* If "num" is large enough, then it can't alias a half-reg because only
+       * the first half of the full reg space aliases half regs. Return NULL in
+       * this case.
+       */
+      else if (num * 2 < ARRAY_SIZE(nop->half_ready))
+         return &nop->half_ready[num * 2];
+      else
+         return NULL;
+   }
+}
+
+static unsigned
+delay_calc(struct ir3_legalize_state *state,
+           struct ir3_instruction *instr,
+           unsigned cycle)
+{
+   /* As far as we know, shader outputs don't need any delay.
+    */
+   if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
+      return 0;
+
+   unsigned delay = 0;
+   foreach_src_n (src, n, instr) {
+      if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
+         continue;
+
+      unsigned elems = post_ra_reg_elems(src);
+      unsigned num = post_ra_reg_num(src);
+      unsigned src_cycle = cycle;
+
+      /* gat and swz have scalar sources and each source is read in a
+       * subsequent cycle.
+       */
+      if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ)
+         src_cycle += n;
+
+      /* cat3 instructions consume their last source two cycles later, so they
+       * only need a delay of 1.
+       */
+      if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2)
+         src_cycle += 2;
+
+      for (unsigned elem = 0; elem < elems; elem++, num++) {
+         unsigned ready_cycle =
+            *get_ready_slot(state, src, num, is_alu(instr), true);
+         delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
+
+         /* Increment cycle for ALU instructions with (rptN) where sources are
+          * read each subsequent cycle.
+          */
+         if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
+            src_cycle++;
+      }
+   }
+
+   return delay;
+}
+
+static void
+delay_update(struct ir3_legalize_state *state,
+             struct ir3_instruction *instr,
+             unsigned cycle,
+             bool mergedregs)
+{
+   foreach_dst_n (dst, n, instr) {
+      unsigned elems = post_ra_reg_elems(dst);
+      unsigned num = post_ra_reg_num(dst);
+      unsigned dst_cycle = cycle;
+
+      /* sct and swz have scalar destinations and each destination is written
+       * in a subsequent cycle.
+       */
+      if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
+         dst_cycle += n;
+
+      /* For relative accesses with (rptN), we have no way of knowing which
+       * component is accessed when, so we have to assume the worst and mark
+       * every array member as being written at the end.
+       */
+      if (dst->flags & IR3_REG_RELATIV)
+         dst_cycle += instr->repeat;
+
+      if (dst->flags & IR3_REG_SHARED)
+         continue;
+
+      for (unsigned elem = 0; elem < elems; elem++, num++) {
+         for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
+            for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
+               unsigned *ready_slot =
+                  get_ready_slot(state, dst, num, consumer_alu, matching_size);
+
+               if (!ready_slot)
+                  continue;
+
+               bool reset_ready_slot = false;
+               unsigned delay = 0;
+               if (!is_alu(instr)) {
+                  /* Apparently writes that require (ss) or (sy) are
+                   * synchronized against previous writes, so consumers don't
+                   * have to wait for any previous overlapping ALU instructions
+                   * to complete.
+                   */
+                  reset_ready_slot = true;
+               } else if ((dst->flags & IR3_REG_PREDICATE) ||
+                          reg_num(dst) == REG_A0) {
+                  delay = 6;
+                  if (!matching_size)
+                     continue;
+               } else {
+                  delay = (consumer_alu && matching_size) ? 3 : 6;
+               }
+
+               if (!matching_size) {
+                  for (unsigned i = 0; i < reg_elem_size(dst); i++) {
+                     ready_slot[i] =
+                        reset_ready_slot ? 0 :
+                        MAX2(ready_slot[i], dst_cycle + delay);
+                  }
+               } else {
+                  *ready_slot =
+                     reset_ready_slot ? 0 :
+                     MAX2(*ready_slot, dst_cycle + delay);
+               }
+            }
+         }
+
+         /* Increment cycle for ALU instructions with (rptN) where destinations
+          * are written each subsequent cycle.
+          */
+         if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
+            dst_cycle++;
+      }
+   }
+}
+
 /* We want to evaluate each block from the position of any other
  * predecessor block, in order that the flags set are the union of
  * all possible program paths.
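One subtlety in get_ready_slot() above is the mismatched-size case: with merged registers, full register rN occupies the same storage as half registers h(2N) and h(2N+1), so a write to one size must also bump the ready slots of the other. A small sketch of just that index mapping, assuming the merged-register layout (the real function additionally special-cases predicate and address registers and returns NULL for high full registers with no half alias):

#include <stdio.h>

int
main(void)
{
   /* A full register aliases two consecutive half registers, which is why
    * delay_update() writes reg_elem_size() consecutive half slots for a
    * full-register destination.
    */
   unsigned full = 3; /* r3 */
   printf("r%u aliases h%u and h%u\n", full, full * 2, full * 2 + 1);

   /* A half register lives in half of one full register, so a half
    * destination maps to full slot num / 2.
    */
   unsigned half = 7; /* h7 */
   printf("h%u lives in r%u\n", half, half / 2);
   return 0;
}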
@@ -140,6 +337,21 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                     &pstate->needs_ss_war);
          regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
          state->needs_ss_for_const |= pstate->needs_ss_for_const;
+
+         /* Our nop state is the max of the predecessor blocks */
+         for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
+            state->pred_ready[i] = MAX2(state->pred_ready[i],
+                                        pstate->pred_ready[i]);
+         for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
+            state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
+                                                pstate->alu_nop.full_ready[i]);
+            state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
+                                                pstate->alu_nop.half_ready[i]);
+            state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
+                                                    pstate->non_alu_nop.full_ready[i]);
+            state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
+                                                    pstate->non_alu_nop.half_ready[i]);
+         }
       }
 
       /* We need to take phsyical-only edges into account when tracking shared
@@ -178,6 +390,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
    list_replace(&block->instr_list, &instr_list);
    list_inithead(&block->instr_list);
 
+   unsigned cycle = 0;
+
    foreach_instr_safe (n, &instr_list) {
       unsigned i;
 
@@ -257,11 +471,40 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          nop = ir3_NOP(block);
          nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
+         last_n = nop;
+         cycle++;
       }
 
-      /* need to be able to set (ss) on first instruction: */
-      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && !is_meta(n))
-         ir3_NOP(block);
+      unsigned delay = delay_calc(state, n, cycle);
+
+      /* NOTE: I think the nopN encoding works for a5xx and
+       * probably a4xx, but not a3xx. So far only tested on
+       * a6xx.
+       */
+
+      if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
+          ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
+          (last_n->repeat == 0)) {
+         /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+         unsigned transfer = MIN2(delay, 3 - last_n->nop);
+         last_n->nop += transfer;
+         delay -= transfer;
+         cycle += transfer;
+      }
+
+      if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
+         /* the previous nop can encode at most 5 repeats: */
+         unsigned transfer = MIN2(delay, 5 - last_n->repeat);
+         last_n->repeat += transfer;
+         delay -= transfer;
+         cycle += transfer;
+      }
+
+      if (delay > 0) {
+         assert(delay <= 6);
+         ir3_NOP(block)->repeat = delay - 1;
+         cycle += delay;
+      }
 
       if (ctx->compiler->samgq_workaround &&
          ctx->type != MESA_SHADER_FRAGMENT &&
@@ -328,6 +571,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          }
       }
 
+      if (count_instruction(n))
+         cycle += 1;
+
+      delay_update(state, n, cycle, mergedregs);
+
+      if (count_instruction(n))
+         cycle += n->repeat;
+
       if (ctx->early_input_release && is_input(n)) {
          last_input_needs_ss |= (n->opc == OPC_LDLV);
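The emission path above tries not to spend whole instructions on delay: on a6xx a cat2/cat3 instruction can absorb up to 3 trailing nops in its (nopN) field, and an explicit nop can carry up to 5 extra repeats. A standalone sketch of that folding, for an assumed 5-cycle delay following a fresh cat2 instruction:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
   unsigned delay = 5;    /* nops owed before the next instruction */
   unsigned prev_nop = 0; /* (nopN) already carried by a prior cat2/cat3 op */

   /* Fold up to 3 nops into the previous instruction's (nopN) field: */
   unsigned transfer = MIN2(delay, 3 - prev_nop);
   prev_nop += transfer;
   delay -= transfer;

   printf("previous instruction becomes (nop%u)\n", prev_nop); /* (nop3) */

   /* The remainder becomes one explicit nop with a repeat count: */
   if (delay > 0)
      printf("emit (rpt%u) nop\n", delay - 1); /* (rpt1) nop covers 2 cycles */
   return 0;
}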
+ */ + for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++) + state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle; + for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) { + state->alu_nop.full_ready[i] = + MAX2(state->alu_nop.full_ready[i], cycle) - cycle; + state->alu_nop.half_ready[i] = + MAX2(state->alu_nop.half_ready[i], cycle) - cycle; + state->non_alu_nop.full_ready[i] = + MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle; + state->non_alu_nop.half_ready[i] = + MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle; + } + bd->valid = true; if (memcmp(&prev_state, state, sizeof(*state))) { @@ -407,8 +676,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * dsxpp.1.p dst, src * * We apply this after flags syncing, as we don't want to sync in between the - * two (which might happen if dst == src). We do it before nop scheduling - * because that needs to count actual instructions. + * two (which might happen if dst == src). */ static bool apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block) @@ -865,55 +1133,6 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so) } } -/* Insert nop's required to make this a legal/valid shader program: */ -static void -nop_sched(struct ir3 *ir, struct ir3_shader_variant *so) -{ - foreach_block (block, &ir->block_list) { - struct ir3_instruction *last = NULL; - struct list_head instr_list; - - /* remove all the instructions from the list, we'll be adding - * them back in as we go - */ - list_replace(&block->instr_list, &instr_list); - list_inithead(&block->instr_list); - - foreach_instr_safe (instr, &instr_list) { - unsigned delay = ir3_delay_calc(block, instr, so->mergedregs); - - /* NOTE: I think the nopN encoding works for a5xx and - * probably a4xx, but not a3xx. So far only tested on - * a6xx. - */ - - if ((delay > 0) && (ir->compiler->gen >= 6) && last && - ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) && - (last->repeat == 0)) { - /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ - unsigned transfer = MIN2(delay, 3 - last->nop); - last->nop += transfer; - delay -= transfer; - } - - if ((delay > 0) && last && (last->opc == OPC_NOP)) { - /* the previous nop can encode at most 5 repeats: */ - unsigned transfer = MIN2(delay, 5 - last->repeat); - last->repeat += transfer; - delay -= transfer; - } - - if (delay > 0) { - assert(delay <= 6); - ir3_NOP(block)->repeat = delay - 1; - } - - list_addtail(&instr->node, &block->instr_list); - last = instr; - } - } -} - static void dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so) { @@ -1227,8 +1446,6 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) progress |= apply_fine_deriv_macro(ctx, block); } - nop_sched(ir, so); - if (ir3_shader_debug & IR3_DBG_FULLSYNC) { dbg_sync_sched(ir, so); } diff --git a/src/freedreno/ir3/tests/delay.c b/src/freedreno/ir3/tests/delay.c index 66e14c092a8..eab6a229231 100644 --- a/src/freedreno/ir3/tests/delay.c +++ b/src/freedreno/ir3/tests/delay.c @@ -145,6 +145,30 @@ fixup_wrmask(struct ir3 *ir) } } +/* Calculate the number of nops added before the last instruction by + * ir3_legalize. 
+ */ +static unsigned +calc_nops(struct ir3_block *block, struct ir3_instruction *last) +{ + unsigned nops = 0; + + foreach_instr_rev (instr, &block->instr_list) { + if (instr == last) + continue; + + if (instr->opc == OPC_NOP) { + nops += 1 + instr->repeat; + } else { + if (is_alu(instr)) + nops += instr->nop; + break; + } + } + + return nops; +} + int main(int argc, char **argv) { @@ -177,13 +201,10 @@ main(int argc, char **argv) break; } - /* The delay calc is expecting the instr to not yet be added to the - * block, so remove it from the block so that it doesn't get counted - * in the distance from assigner: - */ - list_delinit(&last->node); + int max_bary; + ir3_legalize(ir, shader->variants, &max_bary); - unsigned n = ir3_delay_calc(block, last, true); + unsigned n = calc_nops(block, last); if (n != test->expected_delay) { printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n", i,