diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 4d23a80c15a..571a89fb7be 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -459,6 +459,8 @@ struct choose_scoreboard {
         int last_uniforms_reset_tick;
         int last_thrsw_tick;
         bool tlb_locked;
+        bool ldvary_pipelining;
+        bool fixup_ldvary;
 };
 
 static bool
@@ -890,6 +892,20 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
 
         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                             dag.link) {
+                /* If we are scheduling a pipelined smooth varying sequence then
+                 * we want to pick up the next instruction in the sequence.
+                 */
+                if (scoreboard->ldvary_pipelining &&
+                    !n->inst->ldvary_pipelining) {
+                        continue;
+                }
+
+                /* Sanity check: if we are scheduling a smooth ldvary sequence
+                 * we cannot be starting another sequence in the middle of it.
+                 */
+                assert(!scoreboard->ldvary_pipelining ||
+                       !n->inst->ldvary_pipelining_start);
+
                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
 
                 /* Simulator complains if we have two uniforms loaded in the
@@ -946,12 +962,6 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                  * sooner. If the ldvary's r5 wasn't used, then ldunif might
                  * otherwise get scheduled so ldunif and ldvary try to update
                  * r5 in the same tick.
-                 *
-                 * XXX perf: To get good pipelining of a sequence of varying
-                 * loads, we need to figure out how to pair the ldvary signal
-                 * up to the instruction before the last r5 user in the
-                 * previous ldvary sequence. Currently, it usually pairs with
-                 * the last r5 user.
                  */
                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -985,6 +995,16 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                                             &prev_inst->inst->qpu, inst)) {
                                 continue;
                         }
+
+                        /* If we find an ldvary inside an ongoing pipelineable
+                         * ldvary sequence we want to pick that and start
+                         * pipelining the new sequence into the previous one.
+                         */
+                        if (scoreboard->ldvary_pipelining && inst->sig.ldvary) {
+                                assert(n->inst->ldvary_pipelining);
+                                scoreboard->fixup_ldvary = true;
+                                return n;
+                        }
                 }
 
                 int prio = get_instruction_priority(devinfo, inst);
@@ -1025,6 +1045,26 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                 }
         }
 
+        /* If we are in the middle of an ldvary sequence we only pick up
+         * instructions that can continue the sequence so we can pipeline
+         * them. However, if we failed to find anything to schedule, then we
+         * can't possibly continue the sequence, so we stop the pipelining
+         * process and try again.
+         */
+        if (scoreboard->ldvary_pipelining && !prev_inst && !chosen) {
+                scoreboard->ldvary_pipelining = false;
+                chosen = choose_instruction_to_schedule(devinfo, scoreboard, prev_inst);
+        } else if (chosen) {
+                if (scoreboard->ldvary_pipelining) {
+                        assert(chosen->inst->ldvary_pipelining);
+                        if (chosen->inst->ldvary_pipelining_end)
+                                scoreboard->ldvary_pipelining = false;
+                } else if (chosen->inst->ldvary_pipelining_start) {
+                        assert(chosen->inst->qpu.sig.ldvary);
+                        scoreboard->ldvary_pipelining = true;
+                }
+        }
+
         return chosen;
 }
 
@@ -1460,6 +1500,144 @@ emit_thrsw(struct v3d_compile *c,
         return time;
 }
 
+static bool
+alu_reads_register(struct v3d_qpu_instr *inst,
+                   bool add, bool magic, uint32_t index)
+{
+        uint32_t num_src;
+        enum v3d_qpu_mux mux_a, mux_b;
+
+        if (add) {
+                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
+                mux_a = inst->alu.add.a;
+                mux_b = inst->alu.add.b;
+        } else {
+                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+                mux_a = inst->alu.mul.a;
+                mux_b = inst->alu.mul.b;
+        }
+
+        for (int i = 0; i < num_src; i++) {
+                if (magic) {
+                        if (i == 0 && mux_a == index)
+                                return true;
+                        if (i == 1 && mux_b == index)
+                                return true;
+                } else {
+                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+                            inst->raddr_a == index) {
+                                return true;
+                        }
+                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+                            inst->raddr_b == index) {
+                                return true;
+                        }
+                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+                            inst->raddr_a == index) {
+                                return true;
+                        }
+                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+                            inst->raddr_b == index) {
+                                return true;
+                        }
+                }
+        }
+
+        return false;
+}
+
+/**
+ * This takes an ldvary signal merged into 'inst' and tries to move it up to
+ * the previous instruction to get good pipelining of ldvary sequences,
+ * transforming this:
+ *
+ * nop                  ; nop               ; ldvary.r4
+ * nop                  ; fmul  r0, r4, rf0 ;
+ * fadd  rf13, r0, r5   ; nop               ; ldvary.r1   <-- inst
+ *
+ * into:
+ *
+ * nop                  ; nop               ; ldvary.r4
+ * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
+ * fadd  rf13, r0, r5   ; nop               ;             <-- inst
+ *
+ * If we manage to do this successfully (we return true here), then flagging
+ * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
+ * we will be able to pick up to merge into 'inst', leading to code like this:
+ *
+ * nop                  ; nop               ; ldvary.r4
+ * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
+ * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;             <-- inst
+ */
+static bool
+fixup_pipelined_ldvary(struct v3d_compile *c,
+                       struct choose_scoreboard *scoreboard,
+                       struct qblock *block,
+                       struct v3d_qpu_instr *inst)
+{
+        /* We only call this if we have successfully merged an ldvary into a
+         * previous instruction.
+         */
+        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+        assert(inst->sig.ldvary);
+        uint32_t ldvary_magic = inst->sig_magic;
+        uint32_t ldvary_index = inst->sig_addr;
+
+        /* The instruction in which we merged the ldvary cannot read
+         * the ldvary destination: if it did, moving the ldvary before
+         * it would overwrite the value it reads.
+         */
+        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+                return false;
+        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+                return false;
+
+        /* The previous instruction can't write to the same destination as the
+         * ldvary.
+         */
+        struct qinst *prev = (struct qinst *) block->instructions.prev;
+        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+                return false;
+
+        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
+                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
+                    prev->qpu.alu.add.waddr == ldvary_index) {
+                        return false;
+                }
+        }
+
+        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
+                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
+                    prev->qpu.alu.mul.waddr == ldvary_index) {
+                        return false;
+                }
+        }
+
+        /* The previous instruction cannot have a conflicting signal */
+        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+                return false;
+
+        /* The previous instruction cannot use flags since ldvary uses the
+         * 'cond' instruction field to store the destination.
+         */
+        if (v3d_qpu_writes_flags(&prev->qpu))
+                return false;
+
+        /* Move the ldvary to the previous instruction and remove it from the
+         * current one.
+         */
+        prev->qpu.sig.ldvary = true;
+        prev->qpu.sig_magic = ldvary_magic;
+        prev->qpu.sig_addr = ldvary_index;
+        scoreboard->last_ldvary_tick = scoreboard->tick - 1;
+
+        inst->sig.ldvary = false;
+        inst->sig_magic = false;
+        inst->sig_addr = 0;
+
+        return true;
+}
+
 static uint32_t
 schedule_instructions(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
@@ -1530,6 +1708,21 @@ schedule_instructions(struct v3d_compile *c,
                                 v3d_qpu_dump(devinfo, inst);
                                 fprintf(stderr, "\n");
                         }
+
+                        if (scoreboard->fixup_ldvary) {
+                                assert(scoreboard->ldvary_pipelining);
+                                scoreboard->fixup_ldvary = false;
+                                if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
+                                        /* Flag the ldvary as scheduled
+                                         * now so we can try to merge the
+                                         * follow-up fmul into the current
+                                         * instruction.
+                                         */
+                                        mark_instruction_scheduled(
+                                                devinfo, scoreboard->dag,
+                                                time, merge);
+                                }
+                        }
                 }
                 if (mux_read_stalls(scoreboard, inst))
                         c->qpu_inst_stalled_count++;
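
Note: as a standalone illustration of the two data-hazard checks that gate the
ldvary move in fixup_pipelined_ldvary() above, here is a minimal toy sketch.
It is not Mesa code: the toy_inst struct, the register-name strings, and
can_hoist_ldvary() are all hypothetical simplifications, and it assumes the
signal, flag and destination-encoding constraints from the patch have already
been verified separately.

/* Toy model of the ldvary hoisting checks (hypothetical, not Mesa code). */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct toy_inst {
        bool ldvary;            /* instruction carries the ldvary signal */
        char ldvary_dst[8];     /* register the ldvary writes, e.g. "r1" */
        char write[8];          /* ALU destination ("" if nop) */
        char read[2][8];        /* ALU sources ("" if unused) */
};

/* Mirrors the two checks in fixup_pipelined_ldvary(): 'inst' must not read
 * the ldvary destination (hoisting the signal would clobber its source), and
 * 'prev' must not write it (the hoisted ldvary would overwrite prev's result).
 */
static bool
can_hoist_ldvary(const struct toy_inst *prev, const struct toy_inst *inst)
{
        for (int i = 0; i < 2; i++) {
                if (strcmp(inst->read[i], inst->ldvary_dst) == 0)
                        return false;
        }
        if (strcmp(prev->write, inst->ldvary_dst) == 0)
                return false;
        return true;
}

int
main(void)
{
        /* prev: nop               ; fmul r0, r4, rf0
         * inst: fadd rf13, r0, r5 ; ldvary.r1
         */
        struct toy_inst prev = { false, "",   "r0",   { "r4", "rf0" } };
        struct toy_inst inst = { true,  "r1", "rf13", { "r0", "r5"  } };

        if (can_hoist_ldvary(&prev, &inst)) {
                /* Move the signal up one instruction, as the patch does. */
                prev.ldvary = true;
                strcpy(prev.ldvary_dst, inst.ldvary_dst);
                inst.ldvary = false;
                inst.ldvary_dst[0] = '\0';
        }

        printf("ldvary on prev: %d, on inst: %d\n", prev.ldvary, inst.ldvary);
        return 0;
}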