vc4: Fix latency handling for QPU texture scheduling.

There's only high latency between a complete texture fetch setup and
collecting its result, not between each step of setting up the texture
fetch request.
This commit is contained in:
Eric Anholt 2015-12-18 11:30:30 -08:00
parent 960f48809f
commit 5278c64de5

View file

@@ -50,7 +50,7 @@ struct schedule_node {
uint32_t child_array_size;
uint32_t parent_count;
/* Longest cycles + n->latency of any parent of this node. */
/* Longest cycles + instruction_latency() of any parent of this node. */
uint32_t unblocked_time;
/**
@@ -624,6 +624,46 @@ dump_state(struct list_head *schedule_list)
}
}
/**
 * Returns the number of cycles that must pass between an instruction
 * writing @waddr and the instruction @after that consumes the result.
 */
static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
{
        /* Writes to the accumulators/regfiles (waddr < 32) are readable
         * two cycles later.
         */
        if (waddr < 32)
                return 2;

        /* Model the huge latency between kicking off a texture fetch
         * (write to the TMU "S" coordinate register) and the instruction
         * that actually loads the TMU result.  Steps of setting up the
         * request don't pay this cost -- only request-to-result does.
         */
        uint32_t after_sig = QPU_GET_FIELD(after, QPU_SIG);
        if ((waddr == QPU_W_TMU0_S && after_sig == QPU_SIG_LOAD_TMU0) ||
            (waddr == QPU_W_TMU1_S && after_sig == QPU_SIG_LOAD_TMU1))
                return 100;

        /* SFU results land in r4 a few cycles after the request. */
        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}
/**
 * Latency between @before producing its results and @after being able to
 * consume them: the worse of @before's two write destinations (ADD and
 * MUL pipes), as judged against the consuming instruction @after.
 */
static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        uint64_t before_inst = before->inst->inst;
        uint64_t after_inst = after->inst->inst;

        uint32_t add_lat = waddr_latency(QPU_GET_FIELD(before_inst,
                                                       QPU_WADDR_ADD),
                                         after_inst);
        uint32_t mul_lat = waddr_latency(QPU_GET_FIELD(before_inst,
                                                       QPU_WADDR_MUL),
                                         after_inst);

        return add_lat > mul_lat ? add_lat : mul_lat;
}
/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
@@ -635,7 +675,8 @@ compute_delay(struct schedule_node *n)
if (!n->children[i].node->delay)
compute_delay(n->children[i].node);
n->delay = MAX2(n->delay,
n->children[i].node->delay + n->latency);
n->children[i].node->delay +
instruction_latency(n, n->children[i].node));
}
}
}
@@ -664,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
* immediately after (or paired with!) the thing reading the
* destination.
*/
int latency_from_previous = war_only ? 0 : node->latency;
uint32_t latency = 0;
if (!war_only) {
latency = instruction_latency(node,
node->children[i].node);
}
child->unblocked_time = MAX2(child->unblocked_time,
time + latency_from_previous);
time + latency);
child->parent_count--;
if (child->parent_count == 0)
list_add(&child->link, schedule_list);
@@ -799,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
return time;
}
/**
 * Returns the number of cycles before a result written to @waddr may be
 * read back, independent of which instruction does the reading.
 */
static uint32_t waddr_latency(uint32_t waddr)
{
        /* Accumulator/regfile writes (waddr < 32) are ready after two
         * cycles.
         */
        if (waddr < 32)
                return 2;

        /* Texture fetches: effectively unbounded latency, so use a huge
         * number to push dependent reads as far away as possible.
         */
        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
                return 100;

        /* SFU operations deliver their result to r4 after a short delay. */
        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        }

        return 1;
}
/**
 * Latency of @inst as a whole: the worse of the latencies of its two
 * write destinations (the ADD and MUL pipes).
 */
static uint32_t
instruction_latency(uint64_t inst)
{
        uint32_t add_lat = waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD));
        uint32_t mul_lat = waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL));

        return add_lat > mul_lat ? add_lat : mul_lat;
}
uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
@@ -852,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
n->inst = inst;
n->latency = instruction_latency(inst->inst);
if (reads_uniform(inst->inst)) {
n->uniform = next_uniform++;