vc4: Pair up QPU instructions when scheduling.

We've got two mostly-independent operations in each QPU instruction, so
try to pack two operations together.  This is fairly naive (it doesn't
track reads and writes separately within an instruction, doesn't convert
ADD-based MOVs into MUL-based MOVs, and doesn't reorder across uniform
loads), but it does show a decent improvement on shader-db-2.

total instructions in shared programs: 59583 -> 57651 (-3.24%)
instructions in affected programs:     47361 -> 45429 (-4.08%)
This commit is contained in:
Eric Anholt 2014-12-01 11:48:20 -08:00
parent 7b0067d23a
commit 29c7cf2b2b
3 changed files with 105 additions and 38 deletions

View file

@ -192,36 +192,58 @@ qpu_m_alu2(enum qpu_op_mul op,
return inst;
}
static uint64_t
merge_fields(uint64_t merge,
uint64_t add, uint64_t mul,
static bool
merge_fields(uint64_t *merge,
uint64_t a, uint64_t b,
uint64_t mask, uint64_t ignore)
{
if ((add & mask) == ignore)
return (merge & ~mask) | (mul & mask);
else if ((mul & mask) == ignore)
return (merge & ~mask) | (add & mask);
else {
assert((add & mask) == (mul & mask));
return merge;
if ((a & mask) == ignore) {
*merge = (*merge & ~mask) | (b & mask);
} else if ((b & mask) == ignore) {
*merge = (*merge & ~mask) | (a & mask);
} else {
if ((a & mask) != (b & mask))
return false;
}
return true;
}
uint64_t
qpu_inst(uint64_t add, uint64_t mul)
qpu_merge_inst(uint64_t a, uint64_t b)
{
uint64_t merge = ((add & ~QPU_WADDR_MUL_MASK) |
(mul & ~QPU_WADDR_ADD_MASK));
uint64_t merge = a | b;
bool ok = true;
merge = merge_fields(merge, add, mul, QPU_SIG_MASK,
QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
if (QPU_GET_FIELD(a, QPU_OP_ADD) != QPU_A_NOP &&
QPU_GET_FIELD(b, QPU_OP_ADD) != QPU_A_NOP)
return 0;
merge = merge_fields(merge, add, mul, QPU_RADDR_A_MASK,
QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
merge = merge_fields(merge, add, mul, QPU_RADDR_B_MASK,
QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
if (QPU_GET_FIELD(a, QPU_OP_MUL) != QPU_M_NOP &&
QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP)
return 0;
return merge;
ok = ok && merge_fields(&merge, a, b, QPU_SIG_MASK,
QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));
/* Misc fields that have to match exactly. */
ok = ok && merge_fields(&merge, a, b, QPU_SF | QPU_WS | QPU_PM,
~0);
ok = ok && merge_fields(&merge, a, b, QPU_RADDR_A_MASK,
QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A));
ok = ok && merge_fields(&merge, a, b, QPU_RADDR_B_MASK,
QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));
ok = ok && merge_fields(&merge, a, b, QPU_WADDR_ADD_MASK,
QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD));
ok = ok && merge_fields(&merge, a, b, QPU_WADDR_MUL_MASK,
QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL));
if (ok)
return merge;
else
return 0;
}
uint64_t

View file

@ -129,7 +129,7 @@ uint64_t qpu_a_alu2(enum qpu_op_add op, struct qpu_reg dst,
struct qpu_reg src0, struct qpu_reg src1);
uint64_t qpu_m_alu2(enum qpu_op_mul op, struct qpu_reg dst,
struct qpu_reg src0, struct qpu_reg src1);
uint64_t qpu_inst(uint64_t add, uint64_t mul);
uint64_t qpu_merge_inst(uint64_t a, uint64_t b);
uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val);
uint64_t qpu_set_sig(uint64_t inst, uint32_t sig);
uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond);

View file

@ -465,7 +465,8 @@ get_instruction_priority(uint64_t inst)
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
struct simple_node *schedule_list)
struct simple_node *schedule_list,
uint64_t prev_inst)
{
struct schedule_node *chosen = NULL;
struct simple_node *node;
@ -490,6 +491,15 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
if (pixel_scoreboard_too_soon(scoreboard, inst))
continue;
/* If we're trying to pair with another instruction, check
* that they're compatible.
*/
if (prev_inst != 0) {
inst = qpu_merge_inst(prev_inst, inst);
if (!inst)
continue;
}
int prio = get_instruction_priority(inst);
/* Found a valid instruction. If nothing better comes along,
@ -570,6 +580,23 @@ compute_delay(struct schedule_node *n)
}
}
/**
 * Releases the children of a schedule node once that node has been handled.
 *
 * Every child of @node has its outstanding parent count decremented; any
 * child whose count reaches zero is inserted at the head of @schedule_list.
 * Children are visited from the last entry of the array down to the first.
 *
 * A NULL @node is accepted and ignored, so callers may pass an optional
 * node without checking it themselves.
 */
static void
mark_instruction_scheduled(struct simple_node *schedule_list,
                           struct schedule_node *node)
{
        if (!node)
                return;

        int i = node->child_count;
        while (--i >= 0) {
                struct schedule_node *child = node->children[i];

                /* Still waiting on at least one other parent. */
                if (--child->parent_count != 0)
                        continue;

                insert_at_head(schedule_list, &child->link);
        }
}
static void
schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
{
@ -598,7 +625,9 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
while (!is_empty_list(schedule_list)) {
struct schedule_node *chosen =
choose_instruction_to_schedule(&scoreboard,
schedule_list);
schedule_list,
0);
struct schedule_node *merge = NULL;
/* If there are no valid instructions to schedule, drop a NOP
* in.
@ -610,12 +639,38 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
dump_state(schedule_list);
fprintf(stderr, "chose: ");
vc4_qpu_disasm(&inst, 1);
fprintf(stderr, "\n\n");
fprintf(stderr, "\n");
}
/* Schedule this instruction onto the QPU list. */
if (chosen)
/* Schedule this instruction onto the QPU list. Also try to
* find an instruction to pair with it.
*/
if (chosen) {
remove_from_list(&chosen->link);
merge = choose_instruction_to_schedule(&scoreboard,
schedule_list,
inst);
if (merge) {
remove_from_list(&merge->link);
inst = qpu_merge_inst(inst, merge->inst->inst);
assert(inst != 0);
if (debug) {
fprintf(stderr, "merging: ");
vc4_qpu_disasm(&merge->inst->inst, 1);
fprintf(stderr, "\n");
fprintf(stderr, "resulting in: ");
vc4_qpu_disasm(&inst, 1);
fprintf(stderr, "\n");
}
}
}
if (debug) {
fprintf(stderr, "\n");
}
qpu_serialize_one_inst(c, inst);
update_scoreboard_for_chosen(&scoreboard, inst);
@ -625,18 +680,8 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
* be scheduled. Update the children's unblocked time for this
* DAG edge as we do so.
*/
if (chosen) {
for (int i = chosen->child_count - 1; i >= 0; i--) {
struct schedule_node *child =
chosen->children[i];
child->parent_count--;
if (child->parent_count == 0) {
insert_at_head(schedule_list,
&child->link);
}
}
}
mark_instruction_scheduled(schedule_list, chosen);
mark_instruction_scheduled(schedule_list, merge);
scoreboard.tick++;
}