mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-04 02:40:11 +01:00
vc4: Add support for scheduling of branch instructions.
For now we don't fill the delay slots, and instead just drop in NOPs.
This commit is contained in:
parent
a59da513d3
commit
44df061aaa
2 changed files with 114 additions and 17 deletions
|
|
@ -362,6 +362,17 @@ struct qblock {
|
|||
|
||||
int index;
|
||||
|
||||
/* Instruction IPs for the first and last instruction of the block.
|
||||
* Set by vc4_qpu_schedule.c.
|
||||
*/
|
||||
uint32_t start_qpu_ip;
|
||||
uint32_t end_qpu_ip;
|
||||
|
||||
/* Instruction IP for the branch instruction of the block. Set by
|
||||
* vc4_qpu_schedule.c.
|
||||
*/
|
||||
uint32_t branch_qpu_ip;
|
||||
|
||||
/** @{ used by vc4_qir_live_variables.c */
|
||||
BITSET_WORD *def;
|
||||
BITSET_WORD *use;
|
||||
|
|
|
|||
|
|
@ -354,7 +354,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
|
|||
|
||||
if (sig != QPU_SIG_LOAD_IMM) {
|
||||
process_raddr_deps(state, n, raddr_a, true);
|
||||
if (sig != QPU_SIG_SMALL_IMM)
|
||||
if (sig != QPU_SIG_SMALL_IMM &&
|
||||
sig != QPU_SIG_BRANCH)
|
||||
process_raddr_deps(state, n, raddr_b, false);
|
||||
}
|
||||
|
||||
|
|
@ -392,20 +393,23 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
|
|||
add_read_dep(state, state->last_tlb, n);
|
||||
break;
|
||||
|
||||
case QPU_SIG_BRANCH:
|
||||
add_read_dep(state, state->last_sf, n);
|
||||
break;
|
||||
|
||||
case QPU_SIG_PROG_END:
|
||||
case QPU_SIG_WAIT_FOR_SCOREBOARD:
|
||||
case QPU_SIG_SCOREBOARD_UNLOCK:
|
||||
case QPU_SIG_COVERAGE_LOAD:
|
||||
case QPU_SIG_COLOR_LOAD_END:
|
||||
case QPU_SIG_ALPHA_MASK_LOAD:
|
||||
case QPU_SIG_BRANCH:
|
||||
fprintf(stderr, "Unhandled signal bits %d\n", sig);
|
||||
abort();
|
||||
}
|
||||
|
||||
process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
|
||||
process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
|
||||
if (inst & QPU_SF)
|
||||
if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
|
||||
add_write_dep(state, &state->last_sf, n);
|
||||
}
|
||||
|
||||
|
|
@ -525,6 +529,16 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
|
|||
list_for_each_entry(struct schedule_node, n, schedule_list, link) {
|
||||
uint64_t inst = n->inst->inst;
|
||||
|
||||
/* Don't choose the branch instruction until it's the last one
|
||||
* left. XXX: We could potentially choose it before it's the
|
||||
* last one, if the remaining instructions fit in the delay
|
||||
* slots.
|
||||
*/
|
||||
if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
|
||||
!list_is_singular(schedule_list)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* "An instruction must not read from a location in physical
|
||||
* regfile A or B that was written to by the previous
|
||||
* instruction."
|
||||
|
|
@ -722,19 +736,16 @@ mark_instruction_scheduled(struct list_head *schedule_list,
|
|||
}
|
||||
|
||||
static uint32_t
|
||||
schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
|
||||
schedule_instructions(struct vc4_compile *c,
|
||||
struct choose_scoreboard *scoreboard,
|
||||
struct qblock *block,
|
||||
struct list_head *schedule_list,
|
||||
enum quniform_contents *orig_uniform_contents,
|
||||
uint32_t *orig_uniform_data,
|
||||
uint32_t *next_uniform)
|
||||
{
|
||||
struct choose_scoreboard scoreboard;
|
||||
uint32_t time = 0;
|
||||
|
||||
memset(&scoreboard, 0, sizeof(scoreboard));
|
||||
scoreboard.last_waddr_a = ~0;
|
||||
scoreboard.last_waddr_b = ~0;
|
||||
scoreboard.last_sfu_write_tick = -10;
|
||||
|
||||
if (debug) {
|
||||
fprintf(stderr, "initial deps:\n");
|
||||
dump_state(schedule_list);
|
||||
|
|
@ -749,7 +760,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
|
|||
|
||||
while (!list_empty(schedule_list)) {
|
||||
struct schedule_node *chosen =
|
||||
choose_instruction_to_schedule(&scoreboard,
|
||||
choose_instruction_to_schedule(scoreboard,
|
||||
schedule_list,
|
||||
NULL);
|
||||
struct schedule_node *merge = NULL;
|
||||
|
|
@ -784,7 +795,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
|
|||
(*next_uniform)++;
|
||||
}
|
||||
|
||||
merge = choose_instruction_to_schedule(&scoreboard,
|
||||
merge = choose_instruction_to_schedule(scoreboard,
|
||||
schedule_list,
|
||||
chosen);
|
||||
if (merge) {
|
||||
|
|
@ -818,7 +829,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
|
|||
|
||||
qpu_serialize_one_inst(c, inst);
|
||||
|
||||
update_scoreboard_for_chosen(&scoreboard, inst);
|
||||
update_scoreboard_for_chosen(scoreboard, inst);
|
||||
|
||||
/* Now that we've scheduled a new instruction, some of its
|
||||
* children can be promoted to the list of instructions ready to
|
||||
|
|
@ -828,15 +839,34 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list,
|
|||
mark_instruction_scheduled(schedule_list, time, chosen, false);
|
||||
mark_instruction_scheduled(schedule_list, time, merge, false);
|
||||
|
||||
scoreboard.tick++;
|
||||
scoreboard->tick++;
|
||||
time++;
|
||||
|
||||
if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
|
||||
block->branch_qpu_ip = c->qpu_inst_count - 1;
|
||||
/* Fill the delay slots.
|
||||
*
|
||||
* We should fill these with actual instructions,
|
||||
* instead, but that will probably need to be done
|
||||
* after this, once we know what the leading
|
||||
* instructions of the successors are (so we can
|
||||
* handle A/B register file write latency)
|
||||
*/
|
||||
inst = qpu_NOP();
|
||||
update_scoreboard_for_chosen(scoreboard, inst);
|
||||
qpu_serialize_one_inst(c, inst);
|
||||
qpu_serialize_one_inst(c, inst);
|
||||
qpu_serialize_one_inst(c, inst);
|
||||
}
|
||||
}
|
||||
|
||||
return time;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
|
||||
qpu_schedule_instructions_block(struct vc4_compile *c,
|
||||
struct choose_scoreboard *scoreboard,
|
||||
struct qblock *block,
|
||||
enum quniform_contents *orig_uniform_contents,
|
||||
uint32_t *orig_uniform_data,
|
||||
uint32_t *next_uniform)
|
||||
|
|
@ -871,7 +901,8 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
|
|||
compute_delay(n);
|
||||
}
|
||||
|
||||
uint32_t cycles = schedule_instructions(c, &schedule_list,
|
||||
uint32_t cycles = schedule_instructions(c, scoreboard, block,
|
||||
&schedule_list,
|
||||
orig_uniform_contents,
|
||||
orig_uniform_data,
|
||||
next_uniform);
|
||||
|
|
@ -881,6 +912,46 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block,
|
|||
return cycles;
|
||||
}
|
||||
|
||||
static void
|
||||
qpu_set_branch_targets(struct vc4_compile *c)
|
||||
{
|
||||
qir_for_each_block(block, c) {
|
||||
/* The end block of the program has no branch. */
|
||||
if (!block->successors[0])
|
||||
continue;
|
||||
|
||||
/* If there was no branch instruction, then the successor
|
||||
* block must follow immediately after this one.
|
||||
*/
|
||||
if (block->branch_qpu_ip == ~0) {
|
||||
assert(block->end_qpu_ip + 1 ==
|
||||
block->successors[0]->start_qpu_ip);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Set the branch target for the block that doesn't follow
|
||||
* immediately after ours.
|
||||
*/
|
||||
uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
|
||||
assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
|
||||
assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);
|
||||
|
||||
uint32_t branch_target =
|
||||
(block->successors[0]->start_qpu_ip -
|
||||
(block->branch_qpu_ip + 4)) * sizeof(uint64_t);
|
||||
*branch_inst = (*branch_inst |
|
||||
QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));
|
||||
|
||||
/* Make sure that the if-we-don't-jump successor was scheduled
|
||||
* just after the delay slots.
|
||||
*/
|
||||
if (block->successors[1]) {
|
||||
assert(block->successors[1]->start_qpu_ip ==
|
||||
block->branch_qpu_ip + 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t
|
||||
qpu_schedule_instructions(struct vc4_compile *c)
|
||||
{
|
||||
|
|
@ -895,6 +966,12 @@ qpu_schedule_instructions(struct vc4_compile *c)
|
|||
c->uniform_array_size = c->num_uniforms;
|
||||
uint32_t next_uniform = 0;
|
||||
|
||||
struct choose_scoreboard scoreboard;
|
||||
memset(&scoreboard, 0, sizeof(scoreboard));
|
||||
scoreboard.last_waddr_a = ~0;
|
||||
scoreboard.last_waddr_b = ~0;
|
||||
scoreboard.last_sfu_write_tick = -10;
|
||||
|
||||
if (debug) {
|
||||
fprintf(stderr, "Pre-schedule instructions\n");
|
||||
qir_for_each_block(block, c) {
|
||||
|
|
@ -910,12 +987,21 @@ qpu_schedule_instructions(struct vc4_compile *c)
|
|||
|
||||
uint32_t cycles = 0;
|
||||
qir_for_each_block(block, c) {
|
||||
cycles += qpu_schedule_instructions_block(c, block,
|
||||
block->start_qpu_ip = c->qpu_inst_count;
|
||||
block->branch_qpu_ip = ~0;
|
||||
|
||||
cycles += qpu_schedule_instructions_block(c,
|
||||
&scoreboard,
|
||||
block,
|
||||
uniform_contents,
|
||||
uniform_data,
|
||||
&next_uniform);
|
||||
|
||||
block->end_qpu_ip = c->qpu_inst_count - 1;
|
||||
}
|
||||
|
||||
qpu_set_branch_targets(c);
|
||||
|
||||
assert(next_uniform == c->num_uniforms);
|
||||
|
||||
if (debug) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue