vc4: Add support for scheduling of branch instructions.

For now we don't fill the delay slots, and instead just drop in NOPs.
Eric Anholt 2016-04-27 12:14:07 -07:00
parent a59da513d3
commit 44df061aaa
2 changed files with 114 additions and 17 deletions
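Some context on the hardware: a VC4 QPU branch only takes effect after three delay-slot instructions have issued, which is what the scheduler below pads with NOPs. With hypothetical instruction pointers, a block that ends in a branch is laid out like this:

        /* qpu_insts[], hypothetical IPs:
         *
         *   ip 10: branch      <- block->branch_qpu_ip
         *   ip 11: nop         \
         *   ip 12: nop          } delay slots, filled with qpu_NOP()
         *   ip 13: nop         /
         *   ip 14: ...         <- successors[1] (the fall-through block,
         *                         when there is one) starts here
         */

The code asserts this layout below: a fall-through successor must begin exactly at branch_qpu_ip + 4.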

src/gallium/drivers/vc4/vc4_qir.h

@@ -362,6 +362,17 @@ struct qblock {
         int index;
 
+        /* Instruction IPs for the first and last instruction of the block.
+         * Set by vc4_qpu_schedule.c.
+         */
+        uint32_t start_qpu_ip;
+        uint32_t end_qpu_ip;
+
+        /* Instruction IP for the branch instruction of the block.  Set by
+         * vc4_qpu_schedule.c.
+         */
+        uint32_t branch_qpu_ip;
+
         /** @{ used by vc4_qir_live_variables.c */
         BITSET_WORD *def;
         BITSET_WORD *use;

src/gallium/drivers/vc4/vc4_qpu_schedule.c

@@ -354,7 +354,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
         if (sig != QPU_SIG_LOAD_IMM) {
                 process_raddr_deps(state, n, raddr_a, true);
-                if (sig != QPU_SIG_SMALL_IMM)
+                if (sig != QPU_SIG_SMALL_IMM &&
+                    sig != QPU_SIG_BRANCH)
                         process_raddr_deps(state, n, raddr_b, false);
         }
@@ -392,20 +393,23 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
                 add_read_dep(state, state->last_tlb, n);
                 break;
 
+        case QPU_SIG_BRANCH:
+                add_read_dep(state, state->last_sf, n);
+                break;
+
         case QPU_SIG_PROG_END:
         case QPU_SIG_WAIT_FOR_SCOREBOARD:
         case QPU_SIG_SCOREBOARD_UNLOCK:
         case QPU_SIG_COVERAGE_LOAD:
         case QPU_SIG_COLOR_LOAD_END:
         case QPU_SIG_ALPHA_MASK_LOAD:
-        case QPU_SIG_BRANCH:
                 fprintf(stderr, "Unhandled signal bits %d\n", sig);
                 abort();
         }
 
         process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
         process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
-        if (inst & QPU_SF)
+        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                 add_write_dep(state, &state->last_sf, n);
 }
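The new case expresses that a conditional branch is predicated on the condition flags, so it must stay ordered after whatever instruction last set them; the change to the QPU_SF test above also stops recording the branch itself as a flag writer. Roughly, in illustrative pseudo-assembly (mnemonics are not meant to match the real disassembler):

        /* sub.sf  r0, r1, r2   ; ALU op sets the condition flags (SF)
         * ...
         * brr.zs  else_block   ; conditional branch reads those flags, so
         *                      ; last_sf gives it a read dependency here
         */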
@@ -525,6 +529,16 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                 uint64_t inst = n->inst->inst;
 
+                /* Don't choose the branch instruction until it's the last one
+                 * left.  XXX: We could potentially choose it before it's the
+                 * last one, if the remaining instructions fit in the delay
+                 * slots.
+                 */
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
+                    !list_is_singular(schedule_list)) {
+                        continue;
+                }
+
                 /* "An instruction must not read from a location in physical
                  * regfile A or B that was written to by the previous
                  * instruction."
@@ -722,19 +736,16 @@ mark_instruction_scheduled(struct list_head *schedule_list,
 }
 
 static uint32_t
-schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
+schedule_instructions(struct vc4_compile *c,
+                      struct choose_scoreboard *scoreboard,
+                      struct qblock *block,
+                      struct list_head *schedule_list,
                       enum quniform_contents *orig_uniform_contents,
                       uint32_t *orig_uniform_data,
                       uint32_t *next_uniform)
 {
-        struct choose_scoreboard scoreboard;
         uint32_t time = 0;
 
-        memset(&scoreboard, 0, sizeof(scoreboard));
-        scoreboard.last_waddr_a = ~0;
-        scoreboard.last_waddr_b = ~0;
-        scoreboard.last_sfu_write_tick = -10;
-
         if (debug) {
                 fprintf(stderr, "initial deps:\n");
                 dump_state(schedule_list);
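The scoreboard is no longer local to schedule_instructions: it is set up once per shader and threaded through every block (see the final hunks). Because all blocks are serialized back to back into one instruction stream, a hazard pending at the end of one block (a recent SFU write, or the last regfile A/B write) still constrains the first instructions of the next block, which a per-block scoreboard would forget. A minimal sketch of the resulting call pattern, with the unrelated arguments elided:

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;

        qir_for_each_block(block, c) {
                /* One scoreboard for the whole shader: state left at the
                 * end of one block keeps constraining the next. */
                qpu_schedule_instructions_block(c, &scoreboard, block,
                                                /* ... */);
        }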
@@ -749,7 +760,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
         while (!list_empty(schedule_list)) {
                 struct schedule_node *chosen =
-                        choose_instruction_to_schedule(&scoreboard,
+                        choose_instruction_to_schedule(scoreboard,
                                                        schedule_list,
                                                        NULL);
                 struct schedule_node *merge = NULL;
@@ -784,7 +795,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
                         (*next_uniform)++;
                 }
 
-                merge = choose_instruction_to_schedule(&scoreboard,
+                merge = choose_instruction_to_schedule(scoreboard,
                                                        schedule_list,
                                                        chosen);
                 if (merge) {
@@ -818,7 +829,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
                 qpu_serialize_one_inst(c, inst);
 
-                update_scoreboard_for_chosen(&scoreboard, inst);
+                update_scoreboard_for_chosen(scoreboard, inst);
 
                 /* Now that we've scheduled a new instruction, some of its
                  * children can be promoted to the list of instructions ready to
@@ -828,15 +839,34 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
                 mark_instruction_scheduled(schedule_list, time, chosen, false);
                 mark_instruction_scheduled(schedule_list, time, merge, false);
 
-                scoreboard.tick++;
+                scoreboard->tick++;
                 time++;
+
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
+                        block->branch_qpu_ip = c->qpu_inst_count - 1;
+                        /* Fill the delay slots.
+                         *
+                         * We should fill these with actual instructions,
+                         * instead, but that will probably need to be done
+                         * after this, once we know what the leading
+                         * instructions of the successors are (so we can
+                         * handle A/B register file write latency)
+                         */
+                        inst = qpu_NOP();
+                        update_scoreboard_for_chosen(scoreboard, inst);
+                        qpu_serialize_one_inst(c, inst);
+                        qpu_serialize_one_inst(c, inst);
+                        qpu_serialize_one_inst(c, inst);
+                }
         }
 
         return time;
 }
 
 static uint32_t
-qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
+qpu_schedule_instructions_block(struct vc4_compile *c,
+                                struct choose_scoreboard *scoreboard,
+                                struct qblock *block,
                                 enum quniform_contents *orig_uniform_contents,
                                 uint32_t *orig_uniform_data,
                                 uint32_t *next_uniform)
@@ -871,7 +901,8 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
                 compute_delay(n);
         }
 
-        uint32_t cycles = schedule_instructions(c, &schedule_list,
+        uint32_t cycles = schedule_instructions(c, scoreboard, block,
+                                                &schedule_list,
                                                 orig_uniform_contents,
                                                 orig_uniform_data,
                                                 next_uniform);
@@ -881,6 +912,46 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
         return cycles;
 }
 
+static void
+qpu_set_branch_targets(struct vc4_compile *c)
+{
+        qir_for_each_block(block, c) {
+                /* The end block of the program has no branch. */
+                if (!block->successors[0])
+                        continue;
+
+                /* If there was no branch instruction, then the successor
+                 * block must follow immediately after this one.
+                 */
+                if (block->branch_qpu_ip == ~0) {
+                        assert(block->end_qpu_ip + 1 ==
+                               block->successors[0]->start_qpu_ip);
+                        continue;
+                }
+
+                /* Set the branch target for the block that doesn't follow
+                 * immediately after ours.
+                 */
+                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
+                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
+                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);
+
+                uint32_t branch_target =
+                        (block->successors[0]->start_qpu_ip -
+                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
+                *branch_inst = (*branch_inst |
+                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));
+
+                /* Make sure that the if-we-don't-jump successor was scheduled
+                 * just after the delay slots.
+                 */
+                if (block->successors[1]) {
+                        assert(block->successors[1]->start_qpu_ip ==
+                               block->branch_qpu_ip + 4);
+                }
+        }
+}
+
 uint32_t
 qpu_schedule_instructions(struct vc4_compile *c)
 {
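The target arithmetic above is easier to see with numbers. As the code computes it, a branch target is a byte offset relative to the first instruction after the delay slots, hence the branch_qpu_ip + 4 base (the branch plus its three slots) and the sizeof(uint64_t) scale (QPU instructions are 64 bits). Using the hypothetical IPs from the sketch at the top:

        uint32_t branch_qpu_ip = 10;       /* hypothetical branch IP */
        uint32_t taken_start_qpu_ip = 20;  /* hypothetical successors[0] start */
        uint32_t branch_target =
                (taken_start_qpu_ip - (branch_qpu_ip + 4)) * sizeof(uint64_t);
        /* (20 - 14) * 8 = 48: the taken block starts 48 bytes past the
         * first instruction following the delay slots. */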
@@ -895,6 +966,12 @@ qpu_schedule_instructions(struct vc4_compile *c)
         c->uniform_array_size = c->num_uniforms;
         uint32_t next_uniform = 0;
 
+        struct choose_scoreboard scoreboard;
+        memset(&scoreboard, 0, sizeof(scoreboard));
+        scoreboard.last_waddr_a = ~0;
+        scoreboard.last_waddr_b = ~0;
+        scoreboard.last_sfu_write_tick = -10;
+
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");
                 qir_for_each_block(block, c) {
@@ -910,12 +987,21 @@ qpu_schedule_instructions(struct vc4_compile *c)
         uint32_t cycles = 0;
         qir_for_each_block(block, c) {
-                cycles += qpu_schedule_instructions_block(c, block,
+                block->start_qpu_ip = c->qpu_inst_count;
+                block->branch_qpu_ip = ~0;
+
+                cycles += qpu_schedule_instructions_block(c,
+                                                          &scoreboard,
+                                                          block,
                                                           uniform_contents,
                                                           uniform_data,
                                                           &next_uniform);
+
+                block->end_qpu_ip = c->qpu_inst_count - 1;
         }
 
+        qpu_set_branch_targets(c);
+
         assert(next_uniform == c->num_uniforms);
 
         if (debug) {