vc4: Add support for scheduling of branch instructions.

For now we don't fill the delay slots, and instead just drop in NOPs.
Eric Anholt 2016-04-27 12:14:07 -07:00
parent a59da513d3
commit 44df061aaa
2 changed files with 114 additions and 17 deletions
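Some context on the hardware: a VC4 QPU branch only takes effect after three delay-slot instructions have issued, which is what the scheduler below pads with NOPs. With hypothetical instruction pointers, a block that ends in a branch is laid out like this:

        /* qpu_insts[], hypothetical IPs:
         *
         *   ip 10: branch      <- block->branch_qpu_ip
         *   ip 11: nop         \
         *   ip 12: nop          } delay slots, filled with qpu_NOP()
         *   ip 13: nop         /
         *   ip 14: ...         <- successors[1] (the fall-through block,
         *                         when there is one) starts here
         */

The code asserts this layout below: a fall-through successor must begin exactly at branch_qpu_ip + 4.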

src/gallium/drivers/vc4/vc4_qir.h

@@ -362,6 +362,17 @@ struct qblock {
         int index;
 
+        /* Instruction IPs for the first and last instruction of the block.
+         * Set by vc4_qpu_schedule.c.
+         */
+        uint32_t start_qpu_ip;
+        uint32_t end_qpu_ip;
+
+        /* Instruction IP for the branch instruction of the block.  Set by
+         * vc4_qpu_schedule.c.
+         */
+        uint32_t branch_qpu_ip;
+
         /** @{ used by vc4_qir_live_variables.c */
         BITSET_WORD *def;
         BITSET_WORD *use;

src/gallium/drivers/vc4/vc4_qpu_schedule.c

@@ -354,7 +354,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
         if (sig != QPU_SIG_LOAD_IMM) {
                 process_raddr_deps(state, n, raddr_a, true);
-                if (sig != QPU_SIG_SMALL_IMM)
+                if (sig != QPU_SIG_SMALL_IMM &&
+                    sig != QPU_SIG_BRANCH)
                         process_raddr_deps(state, n, raddr_b, false);
         }
@@ -392,20 +393,23 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
                 add_read_dep(state, state->last_tlb, n);
                 break;
 
+        case QPU_SIG_BRANCH:
+                add_read_dep(state, state->last_sf, n);
+                break;
+
         case QPU_SIG_PROG_END:
         case QPU_SIG_WAIT_FOR_SCOREBOARD:
         case QPU_SIG_SCOREBOARD_UNLOCK:
         case QPU_SIG_COVERAGE_LOAD:
         case QPU_SIG_COLOR_LOAD_END:
         case QPU_SIG_ALPHA_MASK_LOAD:
-        case QPU_SIG_BRANCH:
                 fprintf(stderr, "Unhandled signal bits %d\n", sig);
                 abort();
         }
 
         process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
         process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
-        if (inst & QPU_SF)
+        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                 add_write_dep(state, &state->last_sf, n);
 }
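The new case expresses that a conditional branch is predicated on the condition flags, so it must stay ordered after whatever instruction last set them; the change to the QPU_SF test above also stops recording the branch itself as a flag writer. Roughly, in illustrative pseudo-assembly (mnemonics are not meant to match the real disassembler):

        /* sub.sf  r0, r1, r2   ; ALU op sets the condition flags (SF)
         * ...
         * brr.zs  else_block   ; conditional branch reads those flags, so
         *                      ; last_sf gives it a read dependency here
         */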
@@ -525,6 +529,16 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                 uint64_t inst = n->inst->inst;
 
+                /* Don't choose the branch instruction until it's the last one
+                 * left.  XXX: We could potentially choose it before it's the
+                 * last one, if the remaining instructions fit in the delay
+                 * slots.
+                 */
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
+                    !list_is_singular(schedule_list)) {
+                        continue;
+                }
+
                 /* "An instruction must not read from a location in physical
                  * regfile A or B that was written to by the previous
                  * instruction."
@@ -722,19 +736,16 @@ mark_instruction_scheduled(struct list_head *schedule_list,
 }
 
 static uint32_t
-schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
+schedule_instructions(struct vc4_compile *c,
+                      struct choose_scoreboard *scoreboard,
+                      struct qblock *block,
+                      struct list_head *schedule_list,
                       enum quniform_contents *orig_uniform_contents,
                       uint32_t *orig_uniform_data,
                       uint32_t *next_uniform)
 {
-        struct choose_scoreboard scoreboard;
         uint32_t time = 0;
 
-        memset(&scoreboard, 0, sizeof(scoreboard));
-        scoreboard.last_waddr_a = ~0;
-        scoreboard.last_waddr_b = ~0;
-        scoreboard.last_sfu_write_tick = -10;
-
         if (debug) {
                 fprintf(stderr, "initial deps:\n");
                 dump_state(schedule_list);
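The scoreboard is no longer local to schedule_instructions: it is set up once per shader and threaded through every block (see the final hunks). Because all blocks are serialized back to back into one instruction stream, a hazard pending at the end of one block (a recent SFU write, or the last regfile A/B write) still constrains the first instructions of the next block, which a per-block scoreboard would forget. A minimal sketch of the resulting call pattern, with the unrelated arguments elided:

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;

        qir_for_each_block(block, c) {
                /* One scoreboard for the whole shader: state left at the
                 * end of one block keeps constraining the next. */
                qpu_schedule_instructions_block(c, &scoreboard, block,
                                                /* ... */);
        }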
@@ -749,7 +760,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
         while (!list_empty(schedule_list)) {
                 struct schedule_node *chosen =
-                        choose_instruction_to_schedule(&scoreboard,
+                        choose_instruction_to_schedule(scoreboard,
                                                        schedule_list,
                                                        NULL);
                 struct schedule_node *merge = NULL;
@@ -784,7 +795,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
                         (*next_uniform)++;
                 }
 
-                merge = choose_instruction_to_schedule(&scoreboard,
+                merge = choose_instruction_to_schedule(scoreboard,
                                                        schedule_list,
                                                        chosen);
                 if (merge) {
@@ -818,7 +829,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
                 qpu_serialize_one_inst(c, inst);
 
-                update_scoreboard_for_chosen(&scoreboard, inst);
+                update_scoreboard_for_chosen(scoreboard, inst);
 
                 /* Now that we've scheduled a new instruction, some of its
                  * children can be promoted to the list of instructions ready to
@@ -828,15 +839,34 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
                 mark_instruction_scheduled(schedule_list, time, chosen, false);
                 mark_instruction_scheduled(schedule_list, time, merge, false);
 
-                scoreboard.tick++;
+                scoreboard->tick++;
                 time++;
+
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
+                        block->branch_qpu_ip = c->qpu_inst_count - 1;
+                        /* Fill the delay slots.
+                         *
+                         * We should fill these with actual instructions,
+                         * instead, but that will probably need to be done
+                         * after this, once we know what the leading
+                         * instructions of the successors are (so we can
+                         * handle A/B register file write latency)
+                         */
+                        inst = qpu_NOP();
+                        update_scoreboard_for_chosen(scoreboard, inst);
+                        qpu_serialize_one_inst(c, inst);
+                        qpu_serialize_one_inst(c, inst);
+                        qpu_serialize_one_inst(c, inst);
+                }
         }
 
         return time;
 }
 
 static uint32_t
-qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
+qpu_schedule_instructions_block(struct vc4_compile *c,
+                                struct choose_scoreboard *scoreboard,
+                                struct qblock *block,
                                 enum quniform_contents *orig_uniform_contents,
                                 uint32_t *orig_uniform_data,
                                 uint32_t *next_uniform)
@@ -871,7 +901,8 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
                 compute_delay(n);
         }
 
-        uint32_t cycles = schedule_instructions(c, &schedule_list,
+        uint32_t cycles = schedule_instructions(c, scoreboard, block,
+                                                &schedule_list,
                                                 orig_uniform_contents,
                                                 orig_uniform_data,
                                                 next_uniform);
@@ -881,6 +912,46 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
         return cycles;
 }
 
+static void
+qpu_set_branch_targets(struct vc4_compile *c)
+{
+        qir_for_each_block(block, c) {
+                /* The end block of the program has no branch. */
+                if (!block->successors[0])
+                        continue;
+
+                /* If there was no branch instruction, then the successor
+                 * block must follow immediately after this one.
+                 */
+                if (block->branch_qpu_ip == ~0) {
+                        assert(block->end_qpu_ip + 1 ==
+                               block->successors[0]->start_qpu_ip);
+                        continue;
+                }
+
+                /* Set the branch target for the block that doesn't follow
+                 * immediately after ours.
+                 */
+                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
+                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
+                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);
+
+                uint32_t branch_target =
+                        (block->successors[0]->start_qpu_ip -
+                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
+                *branch_inst = (*branch_inst |
+                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));
+
+                /* Make sure that the if-we-don't-jump successor was scheduled
+                 * just after the delay slots.
+                 */
+                if (block->successors[1]) {
+                        assert(block->successors[1]->start_qpu_ip ==
+                               block->branch_qpu_ip + 4);
+                }
+        }
+}
+
 uint32_t
 qpu_schedule_instructions(struct vc4_compile *c)
 {
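The target arithmetic above is easier to see with numbers. As the code computes it, a branch target is a byte offset relative to the first instruction after the delay slots, hence the branch_qpu_ip + 4 base (the branch plus its three slots) and the sizeof(uint64_t) scale (QPU instructions are 64 bits). Using the hypothetical IPs from the sketch at the top:

        uint32_t branch_qpu_ip = 10;       /* hypothetical branch IP */
        uint32_t taken_start_qpu_ip = 20;  /* hypothetical successors[0] start */
        uint32_t branch_target =
                (taken_start_qpu_ip - (branch_qpu_ip + 4)) * sizeof(uint64_t);
        /* (20 - 14) * 8 = 48: the taken block starts 48 bytes past the
         * first instruction following the delay slots. */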
@@ -895,6 +966,12 @@ qpu_schedule_instructions(struct vc4_compile *c)
         c->uniform_array_size = c->num_uniforms;
         uint32_t next_uniform = 0;
 
+        struct choose_scoreboard scoreboard;
+        memset(&scoreboard, 0, sizeof(scoreboard));
+        scoreboard.last_waddr_a = ~0;
+        scoreboard.last_waddr_b = ~0;
+        scoreboard.last_sfu_write_tick = -10;
+
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");
                 qir_for_each_block(block, c) {
@@ -910,12 +987,21 @@ qpu_schedule_instructions(struct vc4_compile *c)
         uint32_t cycles = 0;
         qir_for_each_block(block, c) {
-                cycles += qpu_schedule_instructions_block(c, block,
+                block->start_qpu_ip = c->qpu_inst_count;
+                block->branch_qpu_ip = ~0;
+
+                cycles += qpu_schedule_instructions_block(c,
+                                                          &scoreboard,
+                                                          block,
                                                           uniform_contents,
                                                           uniform_data,
                                                           &next_uniform);
+
+                block->end_qpu_ip = c->qpu_inst_count - 1;
         }
 
+        qpu_set_branch_targets(c);
+
         assert(next_uniform == c->num_uniforms);
 
         if (debug) {