mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-18 22:28:06 +02:00
Based on empirical testing with Sponza and a few UE4 samples this is consistently slightly beneficial for performance. The most likely reason why this helps is that thrsw is probably already quite effective at hiding latency and we are already trying to hide latency at NIR scheduling and also via TMU pipelining, so piling up on this when scheduling QPU typically ends up providing no benefit at all for latency and is instead possibly preventing us from unblocking critical paths in the shader that depend on the TMU result, requiring us to execute more cycles to complete the program. Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17451>
2507 lines
93 KiB
C
2507 lines
93 KiB
C
/*
|
|
* Copyright © 2010 Intel Corporation
|
|
* Copyright © 2014-2017 Broadcom
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
*
|
|
* The basic model of the list scheduler is to take a basic block, compute a
|
|
* DAG of the dependencies, and make a list of the DAG heads. Heuristically
|
|
* pick a DAG head, then put all the children that are now DAG heads into the
|
|
* list of things to schedule.
|
|
*
|
|
* The goal of scheduling here is to pack pairs of operations together in a
|
|
* single QPU instruction.
|
|
*/
|
|
|
|
#include "qpu/qpu_disasm.h"
|
|
#include "v3d_compiler.h"
|
|
#include "util/ralloc.h"
|
|
#include "util/dag.h"
|
|
|
|
/* File-local debug flag; zero-initialized (false). Its users are not in this
 * part of the file.
 */
static bool debug;
|
|
|
|
/* Forward declaration only; no definition or use is visible in this part of
 * the file.
 */
struct schedule_node_child;
|
|
|
|
struct schedule_node {
|
|
struct dag_node dag;
|
|
struct list_head link;
|
|
struct qinst *inst;
|
|
|
|
/* Longest cycles + instruction_latency() of any parent of this node. */
|
|
uint32_t unblocked_time;
|
|
|
|
/**
|
|
* Minimum number of cycles from scheduling this instruction until the
|
|
* end of the program, based on the slowest dependency chain through
|
|
* the children.
|
|
*/
|
|
uint32_t delay;
|
|
|
|
/**
|
|
* cycles between this instruction being scheduled and when its result
|
|
* can be consumed.
|
|
*/
|
|
uint32_t latency;
|
|
};
|
|
|
|
/* Direction of the dependency walk.  add_dep() has to exchange its
 * before/after nodes when the instructions are visited in reverse (R).
 */
enum direction {
        F,
        R,
};
|
|
|
|
struct schedule_state {
|
|
const struct v3d_device_info *devinfo;
|
|
struct dag *dag;
|
|
struct schedule_node *last_r[6];
|
|
struct schedule_node *last_rf[64];
|
|
struct schedule_node *last_sf;
|
|
struct schedule_node *last_vpm_read;
|
|
struct schedule_node *last_tmu_write;
|
|
struct schedule_node *last_tmu_config;
|
|
struct schedule_node *last_tmu_read;
|
|
struct schedule_node *last_tlb;
|
|
struct schedule_node *last_vpm;
|
|
struct schedule_node *last_unif;
|
|
struct schedule_node *last_rtop;
|
|
struct schedule_node *last_unifa;
|
|
enum direction dir;
|
|
/* Estimated cycle when the current instruction would start. */
|
|
uint32_t time;
|
|
};
|
|
|
|
static void
|
|
add_dep(struct schedule_state *state,
|
|
struct schedule_node *before,
|
|
struct schedule_node *after,
|
|
bool write)
|
|
{
|
|
bool write_after_read = !write && state->dir == R;
|
|
uintptr_t edge_data = write_after_read;
|
|
|
|
if (!before || !after)
|
|
return;
|
|
|
|
assert(before != after);
|
|
|
|
if (state->dir == F)
|
|
dag_add_edge(&before->dag, &after->dag, edge_data);
|
|
else
|
|
dag_add_edge(&after->dag, &before->dag, edge_data);
|
|
}
|
|
|
|
/* Records that \p after reads something \p before produced; thin wrapper
 * around add_dep() with write == false.
 */
static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        const bool is_write = false;

        add_dep(state, before, after, is_write);
}
|
|
|
|
/* Records a write dependency of \p after on the node currently stored in
 * \p *before, and makes \p after the new "last writer" in that slot.
 */
static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        struct schedule_node *previous = *before;

        add_dep(state, previous, after, true);
        *before = after;
}
|
|
|
|
static bool
|
|
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
|
|
{
|
|
if (inst->sig.ldtlb || inst->sig.ldtlbu)
|
|
return true;
|
|
|
|
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
|
|
return false;
|
|
|
|
if (inst->alu.add.magic_write &&
|
|
(inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
|
|
inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
|
|
return true;
|
|
|
|
if (inst->alu.mul.magic_write &&
|
|
(inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
|
|
inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static void
|
|
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
|
|
enum v3d_qpu_mux mux)
|
|
{
|
|
switch (mux) {
|
|
case V3D_QPU_MUX_A:
|
|
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
|
|
break;
|
|
case V3D_QPU_MUX_B:
|
|
if (!n->inst->qpu.sig.small_imm) {
|
|
add_read_dep(state,
|
|
state->last_rf[n->inst->qpu.raddr_b], n);
|
|
}
|
|
break;
|
|
default:
|
|
add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
|
|
break;
|
|
}
|
|
}
|
|
|
|
static bool
|
|
tmu_write_is_sequence_terminator(uint32_t waddr)
|
|
{
|
|
switch (waddr) {
|
|
case V3D_QPU_WADDR_TMUS:
|
|
case V3D_QPU_WADDR_TMUSCM:
|
|
case V3D_QPU_WADDR_TMUSF:
|
|
case V3D_QPU_WADDR_TMUSLOD:
|
|
case V3D_QPU_WADDR_TMUA:
|
|
case V3D_QPU_WADDR_TMUAU:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static bool
|
|
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
|
|
{
|
|
if (devinfo->ver < 40)
|
|
return false;
|
|
|
|
if (tmu_write_is_sequence_terminator(waddr))
|
|
return false;
|
|
|
|
if (waddr == V3D_QPU_WADDR_TMUD)
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
|
|
uint32_t waddr, bool magic)
|
|
{
|
|
if (!magic) {
|
|
add_write_dep(state, &state->last_rf[waddr], n);
|
|
} else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
|
|
if (can_reorder_tmu_write(state->devinfo, waddr))
|
|
add_read_dep(state, state->last_tmu_write, n);
|
|
else
|
|
add_write_dep(state, &state->last_tmu_write, n);
|
|
|
|
if (tmu_write_is_sequence_terminator(waddr))
|
|
add_write_dep(state, &state->last_tmu_config, n);
|
|
} else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
|
|
/* Handled by v3d_qpu_writes_r4() check. */
|
|
} else {
|
|
switch (waddr) {
|
|
case V3D_QPU_WADDR_R0:
|
|
case V3D_QPU_WADDR_R1:
|
|
case V3D_QPU_WADDR_R2:
|
|
add_write_dep(state,
|
|
&state->last_r[waddr - V3D_QPU_WADDR_R0],
|
|
n);
|
|
break;
|
|
case V3D_QPU_WADDR_R3:
|
|
case V3D_QPU_WADDR_R4:
|
|
case V3D_QPU_WADDR_R5:
|
|
/* Handled by v3d_qpu_writes_r*() checks below. */
|
|
break;
|
|
|
|
case V3D_QPU_WADDR_VPM:
|
|
case V3D_QPU_WADDR_VPMU:
|
|
add_write_dep(state, &state->last_vpm, n);
|
|
break;
|
|
|
|
case V3D_QPU_WADDR_TLB:
|
|
case V3D_QPU_WADDR_TLBU:
|
|
add_write_dep(state, &state->last_tlb, n);
|
|
break;
|
|
|
|
case V3D_QPU_WADDR_SYNC:
|
|
case V3D_QPU_WADDR_SYNCB:
|
|
case V3D_QPU_WADDR_SYNCU:
|
|
/* For CS barrier(): Sync against any other memory
|
|
* accesses. There doesn't appear to be any need for
|
|
* barriers to affect ALU operations.
|
|
*/
|
|
add_write_dep(state, &state->last_tmu_write, n);
|
|
add_write_dep(state, &state->last_tmu_read, n);
|
|
break;
|
|
|
|
case V3D_QPU_WADDR_UNIFA:
|
|
if (state->devinfo->ver >= 40)
|
|
add_write_dep(state, &state->last_unifa, n);
|
|
break;
|
|
|
|
case V3D_QPU_WADDR_NOP:
|
|
break;
|
|
|
|
default:
|
|
fprintf(stderr, "Unknown waddr %d\n", waddr);
|
|
abort();
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Common code for dependencies that need to be tracked both forward and
|
|
* backward.
|
|
*
|
|
* This is for things like "all reads of r4 have to happen between the r4
|
|
* writes that surround them".
|
|
*/
|
|
static void
|
|
calculate_deps(struct schedule_state *state, struct schedule_node *n)
|
|
{
|
|
const struct v3d_device_info *devinfo = state->devinfo;
|
|
struct qinst *qinst = n->inst;
|
|
struct v3d_qpu_instr *inst = &qinst->qpu;
|
|
/* If the input and output segments are shared, then all VPM reads to
|
|
* a location need to happen before all writes. We handle this by
|
|
* serializing all VPM operations for now.
|
|
*/
|
|
bool separate_vpm_segment = false;
|
|
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
|
if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
|
|
add_read_dep(state, state->last_sf, n);
|
|
|
|
/* XXX: BDI */
|
|
/* XXX: BDU */
|
|
/* XXX: ub */
|
|
/* XXX: raddr_a */
|
|
|
|
add_write_dep(state, &state->last_unif, n);
|
|
return;
|
|
}
|
|
|
|
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
|
|
|
|
/* XXX: LOAD_IMM */
|
|
|
|
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
|
|
process_mux_deps(state, n, inst->alu.add.a);
|
|
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
|
|
process_mux_deps(state, n, inst->alu.add.b);
|
|
|
|
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
|
|
process_mux_deps(state, n, inst->alu.mul.a);
|
|
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
|
|
process_mux_deps(state, n, inst->alu.mul.b);
|
|
|
|
switch (inst->alu.add.op) {
|
|
case V3D_QPU_A_VPMSETUP:
|
|
/* Could distinguish read/write by unpacking the uniform. */
|
|
add_write_dep(state, &state->last_vpm, n);
|
|
add_write_dep(state, &state->last_vpm_read, n);
|
|
break;
|
|
|
|
case V3D_QPU_A_STVPMV:
|
|
case V3D_QPU_A_STVPMD:
|
|
case V3D_QPU_A_STVPMP:
|
|
add_write_dep(state, &state->last_vpm, n);
|
|
break;
|
|
|
|
case V3D_QPU_A_LDVPMV_IN:
|
|
case V3D_QPU_A_LDVPMD_IN:
|
|
case V3D_QPU_A_LDVPMG_IN:
|
|
case V3D_QPU_A_LDVPMP:
|
|
if (!separate_vpm_segment)
|
|
add_write_dep(state, &state->last_vpm, n);
|
|
break;
|
|
|
|
case V3D_QPU_A_VPMWT:
|
|
add_read_dep(state, state->last_vpm, n);
|
|
break;
|
|
|
|
case V3D_QPU_A_MSF:
|
|
add_read_dep(state, state->last_tlb, n);
|
|
break;
|
|
|
|
case V3D_QPU_A_SETMSF:
|
|
case V3D_QPU_A_SETREVF:
|
|
add_write_dep(state, &state->last_tlb, n);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
switch (inst->alu.mul.op) {
|
|
case V3D_QPU_M_MULTOP:
|
|
case V3D_QPU_M_UMUL24:
|
|
/* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
|
|
* resets it to 0. We could possibly reorder umul24s relative
|
|
* to each other, but for now just keep all the MUL parts in
|
|
* order.
|
|
*/
|
|
add_write_dep(state, &state->last_rtop, n);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (inst->alu.add.op != V3D_QPU_A_NOP) {
|
|
process_waddr_deps(state, n, inst->alu.add.waddr,
|
|
inst->alu.add.magic_write);
|
|
}
|
|
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
|
|
process_waddr_deps(state, n, inst->alu.mul.waddr,
|
|
inst->alu.mul.magic_write);
|
|
}
|
|
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
|
|
process_waddr_deps(state, n, inst->sig_addr,
|
|
inst->sig_magic);
|
|
}
|
|
|
|
if (v3d_qpu_writes_r3(devinfo, inst))
|
|
add_write_dep(state, &state->last_r[3], n);
|
|
if (v3d_qpu_writes_r4(devinfo, inst))
|
|
add_write_dep(state, &state->last_r[4], n);
|
|
if (v3d_qpu_writes_r5(devinfo, inst))
|
|
add_write_dep(state, &state->last_r[5], n);
|
|
|
|
/* If we add any more dependencies here we should consider whether we
|
|
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
|
|
*/
|
|
if (inst->sig.thrsw) {
|
|
/* All accumulator contents and flags are undefined after the
|
|
* switch.
|
|
*/
|
|
for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
|
|
add_write_dep(state, &state->last_r[i], n);
|
|
add_write_dep(state, &state->last_sf, n);
|
|
add_write_dep(state, &state->last_rtop, n);
|
|
|
|
/* Scoreboard-locking operations have to stay after the last
|
|
* thread switch.
|
|
*/
|
|
add_write_dep(state, &state->last_tlb, n);
|
|
|
|
add_write_dep(state, &state->last_tmu_write, n);
|
|
add_write_dep(state, &state->last_tmu_config, n);
|
|
}
|
|
|
|
if (v3d_qpu_waits_on_tmu(inst)) {
|
|
/* TMU loads are coming from a FIFO, so ordering is important.
|
|
*/
|
|
add_write_dep(state, &state->last_tmu_read, n);
|
|
/* Keep TMU loads after their TMU lookup terminator */
|
|
add_read_dep(state, state->last_tmu_config, n);
|
|
}
|
|
|
|
/* Allow wrtmuc to be reordered with other instructions in the
|
|
* same TMU sequence by using a read dependency on the last TMU
|
|
* sequence terminator.
|
|
*/
|
|
if (inst->sig.wrtmuc)
|
|
add_read_dep(state, state->last_tmu_config, n);
|
|
|
|
if (inst->sig.ldtlb | inst->sig.ldtlbu)
|
|
add_write_dep(state, &state->last_tlb, n);
|
|
|
|
if (inst->sig.ldvpm) {
|
|
add_write_dep(state, &state->last_vpm_read, n);
|
|
|
|
/* At least for now, we're doing shared I/O segments, so queue
|
|
* all writes after all reads.
|
|
*/
|
|
if (!separate_vpm_segment)
|
|
add_write_dep(state, &state->last_vpm, n);
|
|
}
|
|
|
|
/* inst->sig.ldunif or sideband uniform read */
|
|
if (vir_has_uniform(qinst))
|
|
add_write_dep(state, &state->last_unif, n);
|
|
|
|
/* Both unifa and ldunifa must preserve ordering */
|
|
if (inst->sig.ldunifa || inst->sig.ldunifarf)
|
|
add_write_dep(state, &state->last_unifa, n);
|
|
|
|
if (v3d_qpu_reads_flags(inst))
|
|
add_read_dep(state, state->last_sf, n);
|
|
if (v3d_qpu_writes_flags(inst))
|
|
add_write_dep(state, &state->last_sf, n);
|
|
}
|
|
|
|
static void
|
|
calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
|
|
struct list_head *schedule_list)
|
|
{
|
|
struct schedule_state state;
|
|
|
|
memset(&state, 0, sizeof(state));
|
|
state.dag = dag;
|
|
state.devinfo = c->devinfo;
|
|
state.dir = F;
|
|
|
|
list_for_each_entry(struct schedule_node, node, schedule_list, link)
|
|
calculate_deps(&state, node);
|
|
}
|
|
|
|
static void
|
|
calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
|
|
struct list_head *schedule_list)
|
|
{
|
|
struct schedule_state state;
|
|
|
|
memset(&state, 0, sizeof(state));
|
|
state.dag = dag;
|
|
state.devinfo = c->devinfo;
|
|
state.dir = R;
|
|
|
|
list_for_each_entry_rev(struct schedule_node, node, schedule_list,
|
|
link) {
|
|
calculate_deps(&state, (struct schedule_node *)node);
|
|
}
|
|
}
|
|
|
|
/* State consulted while picking the next instruction to schedule: the
 * current tick and the ticks at which various events last happened.
 */
struct choose_scoreboard {
        struct dag *dag;

        /* Current tick; the last_*_tick fields below are compared with it. */
        int tick;

        int last_magic_sfu_write_tick;
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;

        /* Thread-switch progress flags. */
        bool first_thrsw_emitted;
        bool last_thrsw_emitted;

        bool fixup_ldvary;
        int ldvary_count;
};
|
|
|
|
static bool
|
|
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
|
|
const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
|
|
{
|
|
switch (mux) {
|
|
case V3D_QPU_MUX_R4:
|
|
if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
|
|
return true;
|
|
break;
|
|
|
|
case V3D_QPU_MUX_R5:
|
|
if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
|
|
return true;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool
|
|
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
|
|
struct qinst *qinst)
|
|
{
|
|
const struct v3d_qpu_instr *inst = &qinst->qpu;
|
|
|
|
/* XXX: Branching off of raddr. */
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
|
|
return false;
|
|
|
|
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
|
|
|
|
if (inst->alu.add.op != V3D_QPU_A_NOP) {
|
|
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
|
|
mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
|
|
return true;
|
|
}
|
|
if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
|
|
mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
|
|
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
|
|
mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
|
|
return true;
|
|
}
|
|
if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
|
|
mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
/* XXX: imm */
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool
|
|
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct qinst *qinst)
|
|
{
|
|
const struct v3d_qpu_instr *inst = &qinst->qpu;
|
|
|
|
/* Don't schedule any other r4 write too soon after an SFU write.
|
|
* This would normally be prevented by dependency tracking, but might
|
|
* occur if a dead SFU computation makes it to scheduling.
|
|
*/
|
|
if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
|
|
v3d_qpu_writes_r4(devinfo, inst))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool
|
|
scoreboard_is_locked(struct choose_scoreboard *scoreboard,
|
|
bool lock_scoreboard_on_first_thrsw)
|
|
{
|
|
if (lock_scoreboard_on_first_thrsw) {
|
|
return scoreboard->first_thrsw_emitted &&
|
|
scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
|
|
}
|
|
|
|
return scoreboard->last_thrsw_emitted &&
|
|
scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
|
|
}
|
|
|
|
static bool
|
|
pixel_scoreboard_too_soon(struct v3d_compile *c,
|
|
struct choose_scoreboard *scoreboard,
|
|
const struct v3d_qpu_instr *inst)
|
|
{
|
|
return qpu_inst_is_tlb(inst) &&
|
|
!scoreboard_is_locked(scoreboard,
|
|
c->lock_scoreboard_on_first_thrsw);
|
|
}
|
|
|
|
static bool
|
|
qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
|
|
uint32_t waddr) {
|
|
|
|
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
|
|
return false;
|
|
|
|
if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
|
|
inst->raddr_a == waddr)
|
|
return true;
|
|
|
|
if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
|
|
!inst->sig.small_imm && (inst->raddr_b == waddr))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool
|
|
mux_read_stalls(struct choose_scoreboard *scoreboard,
|
|
const struct v3d_qpu_instr *inst)
|
|
{
|
|
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
|
|
qpu_instruction_uses_rf(inst,
|
|
scoreboard->last_stallable_sfu_reg);
|
|
}
|
|
|
|
/* We define a max schedule priority to allow negative priorities as a result
 * of subtracting this max when an instruction stalls, so that instructions
 * that stall have lower priority than regular instructions. */
|
|
#define MAX_SCHEDULE_PRIORITY 16
|
|
|
|
static int
|
|
get_instruction_priority(const struct v3d_device_info *devinfo,
|
|
const struct v3d_qpu_instr *inst)
|
|
{
|
|
uint32_t baseline_score;
|
|
uint32_t next_score = 0;
|
|
|
|
/* Schedule TLB operations as late as possible, to get more
|
|
* parallelism between shaders.
|
|
*/
|
|
if (qpu_inst_is_tlb(inst))
|
|
return next_score;
|
|
next_score++;
|
|
|
|
/* Empirical testing shows that using priorities to hide latency of
|
|
* TMU operations when scheduling QPU leads to slightly worse
|
|
* performance, even at 2 threads. We think this is because the thread
|
|
* switching is already quite effective at hiding latency and NIR
|
|
* scheduling (and possibly TMU pipelining too) are sufficient to hide
|
|
* TMU latency, so piling up on that here doesn't provide any benefits
|
|
* and instead may cause us to postpone critical paths that depend on
|
|
* the TMU results.
|
|
*/
|
|
#if 0
|
|
/* Schedule texture read results collection late to hide latency. */
|
|
if (v3d_qpu_waits_on_tmu(inst))
|
|
return next_score;
|
|
next_score++;
|
|
#endif
|
|
|
|
/* Default score for things that aren't otherwise special. */
|
|
baseline_score = next_score;
|
|
next_score++;
|
|
|
|
#if 0
|
|
/* Schedule texture read setup early to hide their latency better. */
|
|
if (v3d_qpu_writes_tmu(devinfo, inst))
|
|
return next_score;
|
|
next_score++;
|
|
#endif
|
|
|
|
/* We should increase the maximum if we assert here */
|
|
assert(next_score < MAX_SCHEDULE_PRIORITY);
|
|
|
|
return baseline_score;
|
|
}
|
|
|
|
/* One bit per kind of peripheral access an instruction can make; see
 * qpu_peripherals() for how the mask is built.
 */
enum {
        V3D_PERIPHERAL_VPM_READ       = 0x001,
        V3D_PERIPHERAL_VPM_WRITE      = 0x002,
        V3D_PERIPHERAL_VPM_WAIT       = 0x004,
        V3D_PERIPHERAL_SFU            = 0x008,
        V3D_PERIPHERAL_TMU_WRITE      = 0x010,
        V3D_PERIPHERAL_TMU_READ       = 0x020,
        V3D_PERIPHERAL_TMU_WAIT       = 0x040,
        V3D_PERIPHERAL_TMU_WRTMUC_SIG = 0x080,
        V3D_PERIPHERAL_TSY            = 0x100,
        V3D_PERIPHERAL_TLB            = 0x200,
};
|
|
|
|
static uint32_t
|
|
qpu_peripherals(const struct v3d_device_info *devinfo,
|
|
const struct v3d_qpu_instr *inst)
|
|
{
|
|
uint32_t result = 0;
|
|
if (v3d_qpu_reads_vpm(inst))
|
|
result |= V3D_PERIPHERAL_VPM_READ;
|
|
if (v3d_qpu_writes_vpm(inst))
|
|
result |= V3D_PERIPHERAL_VPM_WRITE;
|
|
if (v3d_qpu_waits_vpm(inst))
|
|
result |= V3D_PERIPHERAL_VPM_WAIT;
|
|
|
|
if (v3d_qpu_writes_tmu(devinfo, inst))
|
|
result |= V3D_PERIPHERAL_TMU_WRITE;
|
|
if (inst->sig.ldtmu)
|
|
result |= V3D_PERIPHERAL_TMU_READ;
|
|
if (inst->sig.wrtmuc)
|
|
result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
|
|
|
|
if (v3d_qpu_uses_sfu(inst))
|
|
result |= V3D_PERIPHERAL_SFU;
|
|
|
|
if (v3d_qpu_uses_tlb(inst))
|
|
result |= V3D_PERIPHERAL_TLB;
|
|
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
|
|
if (inst->alu.add.op != V3D_QPU_A_NOP &&
|
|
inst->alu.add.magic_write &&
|
|
v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
|
|
result |= V3D_PERIPHERAL_TSY;
|
|
}
|
|
|
|
if (inst->alu.add.op == V3D_QPU_A_TMUWT)
|
|
result |= V3D_PERIPHERAL_TMU_WAIT;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
static bool
|
|
qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
|
|
const struct v3d_qpu_instr *a,
|
|
const struct v3d_qpu_instr *b)
|
|
{
|
|
const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
|
|
const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
|
|
|
|
/* We can always do one peripheral access per instruction. */
|
|
if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
|
|
return true;
|
|
|
|
if (devinfo->ver < 41)
|
|
return false;
|
|
|
|
/* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
|
|
* tmuc).
|
|
*/
|
|
if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
|
|
b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
|
|
return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
|
|
}
|
|
|
|
if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
|
|
b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
|
|
return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
|
|
}
|
|
|
|
/* V3D 4.1+ allows TMU read with VPM read/write. */
|
|
if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
|
|
(b_peripherals == V3D_PERIPHERAL_VPM_READ ||
|
|
b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
|
|
return true;
|
|
}
|
|
if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
|
|
(a_peripherals == V3D_PERIPHERAL_VPM_READ ||
|
|
a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/* Compute a bitmask of which rf registers are used between
|
|
* the two instructions.
|
|
*/
|
|
static uint64_t
|
|
qpu_raddrs_used(const struct v3d_qpu_instr *a,
|
|
const struct v3d_qpu_instr *b)
|
|
{
|
|
assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
|
|
assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
|
|
|
|
uint64_t raddrs_used = 0;
|
|
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
|
|
raddrs_used |= (1ll << a->raddr_a);
|
|
if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
|
|
raddrs_used |= (1ll << a->raddr_b);
|
|
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
|
|
raddrs_used |= (1ll << b->raddr_a);
|
|
if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
|
|
raddrs_used |= (1ll << b->raddr_b);
|
|
|
|
return raddrs_used;
|
|
}
|
|
|
|
/* Take two instructions and attempt to merge their raddr fields
|
|
* into one merged instruction. Returns false if the two instructions
|
|
* access more than two different rf registers between them, or more
|
|
* than one rf register and one small immediate.
|
|
*/
|
|
static bool
|
|
qpu_merge_raddrs(struct v3d_qpu_instr *result,
|
|
const struct v3d_qpu_instr *add_instr,
|
|
const struct v3d_qpu_instr *mul_instr)
|
|
{
|
|
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
|
|
int naddrs = util_bitcount64(raddrs_used);
|
|
|
|
if (naddrs > 2)
|
|
return false;
|
|
|
|
if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
|
|
if (naddrs > 1)
|
|
return false;
|
|
|
|
if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
|
|
if (add_instr->raddr_b != mul_instr->raddr_b)
|
|
return false;
|
|
|
|
result->sig.small_imm = true;
|
|
result->raddr_b = add_instr->sig.small_imm ?
|
|
add_instr->raddr_b : mul_instr->raddr_b;
|
|
}
|
|
|
|
if (naddrs == 0)
|
|
return true;
|
|
|
|
int raddr_a = ffsll(raddrs_used) - 1;
|
|
raddrs_used &= ~(1ll << raddr_a);
|
|
result->raddr_a = raddr_a;
|
|
|
|
if (!result->sig.small_imm) {
|
|
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
|
|
raddr_a == add_instr->raddr_b) {
|
|
if (add_instr->alu.add.a == V3D_QPU_MUX_B)
|
|
result->alu.add.a = V3D_QPU_MUX_A;
|
|
if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
|
|
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
|
|
result->alu.add.b = V3D_QPU_MUX_A;
|
|
}
|
|
}
|
|
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
|
|
raddr_a == mul_instr->raddr_b) {
|
|
if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
|
|
result->alu.mul.a = V3D_QPU_MUX_A;
|
|
if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
|
|
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
|
|
result->alu.mul.b = V3D_QPU_MUX_A;
|
|
}
|
|
}
|
|
}
|
|
if (!raddrs_used)
|
|
return true;
|
|
|
|
int raddr_b = ffsll(raddrs_used) - 1;
|
|
result->raddr_b = raddr_b;
|
|
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
|
|
raddr_b == add_instr->raddr_a) {
|
|
if (add_instr->alu.add.a == V3D_QPU_MUX_A)
|
|
result->alu.add.a = V3D_QPU_MUX_B;
|
|
if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
|
|
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
|
|
result->alu.add.b = V3D_QPU_MUX_B;
|
|
}
|
|
}
|
|
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
|
|
raddr_b == mul_instr->raddr_a) {
|
|
if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
|
|
result->alu.mul.a = V3D_QPU_MUX_B;
|
|
if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
|
|
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
|
|
result->alu.mul.b = V3D_QPU_MUX_B;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
can_do_add_as_mul(enum v3d_qpu_add_op op)
|
|
{
|
|
switch (op) {
|
|
case V3D_QPU_A_ADD:
|
|
case V3D_QPU_A_SUB:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static enum v3d_qpu_mul_op
|
|
add_op_as_mul_op(enum v3d_qpu_add_op op)
|
|
{
|
|
switch (op) {
|
|
case V3D_QPU_A_ADD:
|
|
return V3D_QPU_M_ADD;
|
|
case V3D_QPU_A_SUB:
|
|
return V3D_QPU_M_SUB;
|
|
default:
|
|
unreachable("unexpected add opcode");
|
|
}
|
|
}
|
|
|
|
static void
|
|
qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
|
|
{
|
|
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
|
|
assert(inst->alu.add.op != V3D_QPU_A_NOP);
|
|
assert(inst->alu.mul.op == V3D_QPU_M_NOP);
|
|
|
|
memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
|
|
inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
|
|
inst->alu.add.op = V3D_QPU_A_NOP;
|
|
|
|
inst->flags.mc = inst->flags.ac;
|
|
inst->flags.mpf = inst->flags.apf;
|
|
inst->flags.muf = inst->flags.auf;
|
|
inst->flags.ac = V3D_QPU_COND_NONE;
|
|
inst->flags.apf = V3D_QPU_PF_NONE;
|
|
inst->flags.auf = V3D_QPU_UF_NONE;
|
|
|
|
inst->alu.mul.output_pack = inst->alu.add.output_pack;
|
|
inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
|
|
inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
|
|
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
|
|
inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
|
|
inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
|
|
}
|
|
|
|
static bool
|
|
qpu_merge_inst(const struct v3d_device_info *devinfo,
|
|
struct v3d_qpu_instr *result,
|
|
const struct v3d_qpu_instr *a,
|
|
const struct v3d_qpu_instr *b)
|
|
{
|
|
if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
|
|
b->type != V3D_QPU_INSTR_TYPE_ALU) {
|
|
return false;
|
|
}
|
|
|
|
if (!qpu_compatible_peripheral_access(devinfo, a, b))
|
|
return false;
|
|
|
|
struct v3d_qpu_instr merge = *a;
|
|
const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
|
|
|
|
struct v3d_qpu_instr mul_inst;
|
|
if (b->alu.add.op != V3D_QPU_A_NOP) {
|
|
if (a->alu.add.op == V3D_QPU_A_NOP) {
|
|
merge.alu.add = b->alu.add;
|
|
|
|
merge.flags.ac = b->flags.ac;
|
|
merge.flags.apf = b->flags.apf;
|
|
merge.flags.auf = b->flags.auf;
|
|
|
|
add_instr = b;
|
|
mul_instr = a;
|
|
}
|
|
/* If a's add op is used but its mul op is not, then see if we
|
|
* can convert either a's add op or b's add op to a mul op
|
|
* so we can merge.
|
|
*/
|
|
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
|
|
can_do_add_as_mul(b->alu.add.op)) {
|
|
mul_inst = *b;
|
|
qpu_convert_add_to_mul(&mul_inst);
|
|
|
|
merge.alu.mul = mul_inst.alu.mul;
|
|
|
|
merge.flags.mc = b->flags.ac;
|
|
merge.flags.mpf = b->flags.apf;
|
|
merge.flags.muf = b->flags.auf;
|
|
|
|
add_instr = a;
|
|
mul_instr = &mul_inst;
|
|
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
|
|
can_do_add_as_mul(a->alu.add.op)) {
|
|
mul_inst = *a;
|
|
qpu_convert_add_to_mul(&mul_inst);
|
|
|
|
merge = mul_inst;
|
|
merge.alu.add = b->alu.add;
|
|
|
|
merge.flags.ac = b->flags.ac;
|
|
merge.flags.apf = b->flags.apf;
|
|
merge.flags.auf = b->flags.auf;
|
|
|
|
add_instr = b;
|
|
mul_instr = &mul_inst;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (b->alu.mul.op != V3D_QPU_M_NOP) {
|
|
if (a->alu.mul.op != V3D_QPU_M_NOP)
|
|
return false;
|
|
merge.alu.mul = b->alu.mul;
|
|
|
|
merge.flags.mc = b->flags.mc;
|
|
merge.flags.mpf = b->flags.mpf;
|
|
merge.flags.muf = b->flags.muf;
|
|
|
|
mul_instr = b;
|
|
add_instr = a;
|
|
}
|
|
|
|
if (add_instr && mul_instr &&
|
|
!qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
|
|
return false;
|
|
}
|
|
|
|
merge.sig.thrsw |= b->sig.thrsw;
|
|
merge.sig.ldunif |= b->sig.ldunif;
|
|
merge.sig.ldunifrf |= b->sig.ldunifrf;
|
|
merge.sig.ldunifa |= b->sig.ldunifa;
|
|
merge.sig.ldunifarf |= b->sig.ldunifarf;
|
|
merge.sig.ldtmu |= b->sig.ldtmu;
|
|
merge.sig.ldvary |= b->sig.ldvary;
|
|
merge.sig.ldvpm |= b->sig.ldvpm;
|
|
merge.sig.small_imm |= b->sig.small_imm;
|
|
merge.sig.ldtlb |= b->sig.ldtlb;
|
|
merge.sig.ldtlbu |= b->sig.ldtlbu;
|
|
merge.sig.ucb |= b->sig.ucb;
|
|
merge.sig.rotate |= b->sig.rotate;
|
|
merge.sig.wrtmuc |= b->sig.wrtmuc;
|
|
|
|
if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
|
|
v3d_qpu_sig_writes_address(devinfo, &b->sig))
|
|
return false;
|
|
merge.sig_addr |= b->sig_addr;
|
|
merge.sig_magic |= b->sig_magic;
|
|
|
|
uint64_t packed;
|
|
bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
|
|
|
|
*result = merge;
|
|
/* No modifying the real instructions on failure. */
|
|
assert(ok || (a != result && b != result));
|
|
|
|
return ok;
|
|
}
|
|
|
|
static inline bool
|
|
try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
|
|
{
|
|
return inst->sig.ldunif || inst->sig.ldunifrf;
|
|
}
|
|
|
|
/* Forward declaration; the definition is not in this part of the file. */
static bool
qpu_inst_after_thrsw_valid_in_delay_slot(
        struct v3d_compile *c,
        struct choose_scoreboard *scoreboard,
        const struct qinst *qinst);
|
|
|
|
static struct schedule_node *
|
|
choose_instruction_to_schedule(struct v3d_compile *c,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct schedule_node *prev_inst)
|
|
{
|
|
struct schedule_node *chosen = NULL;
|
|
int chosen_prio = 0;
|
|
|
|
/* Don't pair up anything with a thread switch signal -- emit_thrsw()
|
|
* will handle pairing it along with filling the delay slots.
|
|
*/
|
|
if (prev_inst) {
|
|
if (prev_inst->inst->qpu.sig.thrsw)
|
|
return NULL;
|
|
}
|
|
|
|
bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
|
|
scoreboard->ldvary_count < c->num_inputs;
|
|
bool skipped_insts_for_ldvary_pipelining = false;
|
|
retry:
|
|
list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
|
|
dag.link) {
|
|
const struct v3d_qpu_instr *inst = &n->inst->qpu;
|
|
|
|
if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
|
|
skipped_insts_for_ldvary_pipelining = true;
|
|
continue;
|
|
}
|
|
|
|
/* Don't choose the branch instruction until it's the last one
|
|
* left. We'll move it up to fit its delay slots after we
|
|
* choose it.
|
|
*/
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
|
|
!list_is_singular(&scoreboard->dag->heads)) {
|
|
continue;
|
|
}
|
|
|
|
/* We need to have 3 delay slots between a write to unifa and
|
|
* a follow-up ldunifa.
|
|
*/
|
|
if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
|
|
scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
|
|
continue;
|
|
|
|
/* "An instruction must not read from a location in physical
|
|
* regfile A or B that was written to by the previous
|
|
* instruction."
|
|
*/
|
|
if (reads_too_soon_after_write(scoreboard, n->inst))
|
|
continue;
|
|
|
|
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
|
|
continue;
|
|
|
|
/* "Before doing a TLB access a scoreboard wait must have been
|
|
* done. This happens either on the first or last thread
|
|
* switch, depending on a setting (scb_wait_on_first_thrsw) in
|
|
* the shader state."
|
|
*/
|
|
if (pixel_scoreboard_too_soon(c, scoreboard, inst))
|
|
continue;
|
|
|
|
/* ldunif and ldvary both write r5, but ldunif does so a tick
|
|
* sooner. If the ldvary's r5 wasn't used, then ldunif might
|
|
* otherwise get scheduled so ldunif and ldvary try to update
|
|
* r5 in the same tick.
|
|
*/
|
|
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
|
|
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
|
|
continue;
|
|
}
|
|
|
|
/* If we are in a thrsw delay slot check that this instruction
|
|
* is valid for that.
|
|
*/
|
|
if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
|
|
!qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
|
|
n->inst)) {
|
|
continue;
|
|
}
|
|
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
|
/* Don't try to put a branch in the delay slots of another
|
|
* branch or a unifa write.
|
|
*/
|
|
if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
|
|
continue;
|
|
if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
|
|
continue;
|
|
|
|
/* No branch with cond != 0,2,3 and msfign != 0 after
|
|
* setmsf.
|
|
*/
|
|
if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
|
|
inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
|
|
inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
|
|
inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
|
|
inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* If we're trying to pair with another instruction, check
|
|
* that they're compatible.
|
|
*/
|
|
if (prev_inst) {
|
|
/* Don't pair up a thread switch signal -- we'll
|
|
* handle pairing it when we pick it on its own.
|
|
*/
|
|
if (inst->sig.thrsw)
|
|
continue;
|
|
|
|
if (prev_inst->inst->uniform != -1 &&
|
|
n->inst->uniform != -1)
|
|
continue;
|
|
|
|
/* Simulator complains if we have two uniforms loaded in
|
|
* the the same instruction, which could happen if we
|
|
* have a ldunif or sideband uniform and we pair that
|
|
* with ldunifa.
|
|
*/
|
|
if (vir_has_uniform(prev_inst->inst) &&
|
|
(inst->sig.ldunifa || inst->sig.ldunifarf)) {
|
|
continue;
|
|
}
|
|
|
|
if ((prev_inst->inst->qpu.sig.ldunifa ||
|
|
prev_inst->inst->qpu.sig.ldunifarf) &&
|
|
vir_has_uniform(n->inst)) {
|
|
continue;
|
|
}
|
|
|
|
/* Don't merge TLB instructions before we have acquired
|
|
* the scoreboard lock.
|
|
*/
|
|
if (pixel_scoreboard_too_soon(c, scoreboard, inst))
|
|
continue;
|
|
|
|
/* When we succesfully pair up an ldvary we then try
|
|
* to merge it into the previous instruction if
|
|
* possible to improve pipelining. Don't pick up the
|
|
* ldvary now if the follow-up fixup would place
|
|
* it in the delay slots of a thrsw, which is not
|
|
* allowed and would prevent the fixup from being
|
|
* successul.
|
|
*/
|
|
if (inst->sig.ldvary &&
|
|
scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
|
|
continue;
|
|
}
|
|
|
|
struct v3d_qpu_instr merged_inst;
|
|
if (!qpu_merge_inst(c->devinfo, &merged_inst,
|
|
&prev_inst->inst->qpu, inst)) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
int prio = get_instruction_priority(c->devinfo, inst);
|
|
|
|
if (mux_read_stalls(scoreboard, inst)) {
|
|
/* Don't merge an instruction that stalls */
|
|
if (prev_inst)
|
|
continue;
|
|
else {
|
|
/* Any instruction that don't stall will have
|
|
* higher scheduling priority */
|
|
prio -= MAX_SCHEDULE_PRIORITY;
|
|
assert(prio < 0);
|
|
}
|
|
}
|
|
|
|
/* Found a valid instruction. If nothing better comes along,
|
|
* this one works.
|
|
*/
|
|
if (!chosen) {
|
|
chosen = n;
|
|
chosen_prio = prio;
|
|
continue;
|
|
}
|
|
|
|
if (prio > chosen_prio) {
|
|
chosen = n;
|
|
chosen_prio = prio;
|
|
} else if (prio < chosen_prio) {
|
|
continue;
|
|
}
|
|
|
|
if (n->delay > chosen->delay) {
|
|
chosen = n;
|
|
chosen_prio = prio;
|
|
} else if (n->delay < chosen->delay) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* If we did not find any instruction to schedule but we discarded
|
|
* some of them to prioritize ldvary pipelining, try again.
|
|
*/
|
|
if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
|
|
skipped_insts_for_ldvary_pipelining = false;
|
|
ldvary_pipelining = false;
|
|
goto retry;
|
|
}
|
|
|
|
if (chosen && chosen->inst->qpu.sig.ldvary) {
|
|
scoreboard->ldvary_count++;
|
|
/* If we are pairing an ldvary, flag it so we can fix it up for
|
|
* optimal pipelining of ldvary sequences.
|
|
*/
|
|
if (prev_inst)
|
|
scoreboard->fixup_ldvary = true;
|
|
}
|
|
|
|
return chosen;
|
|
}
|
|
|
|
static void
|
|
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
|
|
enum v3d_qpu_waddr waddr,
|
|
const struct v3d_device_info *devinfo)
|
|
{
|
|
if (v3d_qpu_magic_waddr_is_sfu(waddr))
|
|
scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
|
|
else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
|
|
scoreboard->last_unifa_write_tick = scoreboard->tick;
|
|
}
|
|
|
|
static void
|
|
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
|
|
const struct v3d_qpu_instr *inst)
|
|
{
|
|
if (v3d_qpu_instr_is_sfu(inst)) {
|
|
scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
|
|
scoreboard->last_stallable_sfu_tick = scoreboard->tick;
|
|
}
|
|
}
|
|
|
|
static void
|
|
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
|
|
const struct v3d_qpu_instr *inst,
|
|
const struct v3d_device_info *devinfo)
|
|
{
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
|
|
return;
|
|
|
|
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
|
|
|
|
if (inst->alu.add.op != V3D_QPU_A_NOP) {
|
|
if (inst->alu.add.magic_write) {
|
|
update_scoreboard_for_magic_waddr(scoreboard,
|
|
inst->alu.add.waddr,
|
|
devinfo);
|
|
} else {
|
|
update_scoreboard_for_sfu_stall_waddr(scoreboard,
|
|
inst);
|
|
}
|
|
|
|
if (inst->alu.add.op == V3D_QPU_A_SETMSF)
|
|
scoreboard->last_setmsf_tick = scoreboard->tick;
|
|
}
|
|
|
|
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
|
|
if (inst->alu.mul.magic_write) {
|
|
update_scoreboard_for_magic_waddr(scoreboard,
|
|
inst->alu.mul.waddr,
|
|
devinfo);
|
|
}
|
|
}
|
|
|
|
if (inst->sig.ldvary)
|
|
scoreboard->last_ldvary_tick = scoreboard->tick;
|
|
}
|
|
|
|
static void
|
|
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
|
|
{
|
|
list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
|
|
fprintf(stderr, " t=%4d: ", n->unblocked_time);
|
|
v3d_qpu_dump(devinfo, &n->inst->qpu);
|
|
fprintf(stderr, "\n");
|
|
|
|
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
|
|
struct schedule_node *child =
|
|
(struct schedule_node *)edge->child;
|
|
if (!child)
|
|
continue;
|
|
|
|
fprintf(stderr, " - ");
|
|
v3d_qpu_dump(devinfo, &child->inst->qpu);
|
|
fprintf(stderr, " (%d parents, %c)\n",
|
|
child->dag.parent_count,
|
|
edge->data ? 'w' : 'r');
|
|
}
|
|
}
|
|
}
|
|
|
|
static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
|
|
enum v3d_qpu_waddr waddr,
|
|
const struct v3d_qpu_instr *after)
|
|
{
|
|
/* Apply some huge latency between texture fetch requests and getting
|
|
* their results back.
|
|
*
|
|
* FIXME: This is actually pretty bogus. If we do:
|
|
*
|
|
* mov tmu0_s, a
|
|
* <a bit of math>
|
|
* mov tmu0_s, b
|
|
* load_tmu0
|
|
* <more math>
|
|
* load_tmu0
|
|
*
|
|
* we count that as worse than
|
|
*
|
|
* mov tmu0_s, a
|
|
* mov tmu0_s, b
|
|
* <lots of math>
|
|
* load_tmu0
|
|
* <more math>
|
|
* load_tmu0
|
|
*
|
|
* because we associate the first load_tmu0 with the *second* tmu0_s.
|
|
*/
|
|
if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
|
|
v3d_qpu_waits_on_tmu(after)) {
|
|
return 100;
|
|
}
|
|
|
|
/* Assume that anything depending on us is consuming the SFU result. */
|
|
if (v3d_qpu_magic_waddr_is_sfu(waddr))
|
|
return 3;
|
|
|
|
return 1;
|
|
}
|
|
|
|
static uint32_t
|
|
instruction_latency(const struct v3d_device_info *devinfo,
|
|
struct schedule_node *before, struct schedule_node *after)
|
|
{
|
|
const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
|
|
const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
|
|
uint32_t latency = 1;
|
|
|
|
if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
|
|
after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
|
|
return latency;
|
|
|
|
if (before_inst->alu.add.magic_write) {
|
|
latency = MAX2(latency,
|
|
magic_waddr_latency(devinfo,
|
|
before_inst->alu.add.waddr,
|
|
after_inst));
|
|
}
|
|
|
|
if (before_inst->alu.mul.magic_write) {
|
|
latency = MAX2(latency,
|
|
magic_waddr_latency(devinfo,
|
|
before_inst->alu.mul.waddr,
|
|
after_inst));
|
|
}
|
|
|
|
if (v3d_qpu_instr_is_sfu(before_inst))
|
|
return 2;
|
|
|
|
return latency;
|
|
}
|
|
|
|
/** Recursive computation of the delay member of a node. */
|
|
static void
|
|
compute_delay(struct dag_node *node, void *state)
|
|
{
|
|
struct schedule_node *n = (struct schedule_node *)node;
|
|
struct v3d_compile *c = (struct v3d_compile *) state;
|
|
|
|
n->delay = 1;
|
|
|
|
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
|
|
struct schedule_node *child =
|
|
(struct schedule_node *)edge->child;
|
|
|
|
n->delay = MAX2(n->delay, (child->delay +
|
|
instruction_latency(c->devinfo, n,
|
|
child)));
|
|
}
|
|
}
|
|
|
|
/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
|
|
* should be called on it later to finish pruning the other edges).
|
|
*/
|
|
static void
|
|
pre_remove_head(struct dag *dag, struct schedule_node *n)
|
|
{
|
|
list_delinit(&n->dag.link);
|
|
|
|
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
|
|
if (edge->data)
|
|
dag_remove_edge(dag, edge);
|
|
}
|
|
}
|
|
|
|
static void
|
|
mark_instruction_scheduled(const struct v3d_device_info *devinfo,
|
|
struct dag *dag,
|
|
uint32_t time,
|
|
struct schedule_node *node)
|
|
{
|
|
if (!node)
|
|
return;
|
|
|
|
util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
|
|
struct schedule_node *child =
|
|
(struct schedule_node *)edge->child;
|
|
|
|
if (!child)
|
|
continue;
|
|
|
|
uint32_t latency = instruction_latency(devinfo, node, child);
|
|
|
|
child->unblocked_time = MAX2(child->unblocked_time,
|
|
time + latency);
|
|
}
|
|
dag_prune_head(dag, &node->dag);
|
|
}
|
|
|
|
static void
|
|
insert_scheduled_instruction(struct v3d_compile *c,
|
|
struct qblock *block,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct qinst *inst)
|
|
{
|
|
list_addtail(&inst->link, &block->instructions);
|
|
|
|
update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
|
|
c->qpu_inst_count++;
|
|
scoreboard->tick++;
|
|
}
|
|
|
|
static struct qinst *
|
|
vir_nop()
|
|
{
|
|
struct qreg undef = vir_nop_reg();
|
|
struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
|
|
|
|
return qinst;
|
|
}
|
|
|
|
/**
 * Emits a single NOP instruction at the end of the block, advancing the
 * scoreboard tick like any other scheduled instruction.
 */
static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        struct qinst *nop = vir_nop();

        insert_scheduled_instruction(c, block, scoreboard, nop);
}
|
|
|
|
static bool
|
|
qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
|
|
const struct qinst *qinst, int slot)
|
|
{
|
|
const struct v3d_qpu_instr *inst = &qinst->qpu;
|
|
|
|
if (slot == 2 && qinst->is_tlb_z_write)
|
|
return false;
|
|
|
|
if (slot > 0 && qinst->uniform != ~0)
|
|
return false;
|
|
|
|
if (v3d_qpu_waits_vpm(inst))
|
|
return false;
|
|
|
|
if (inst->sig.ldvary)
|
|
return false;
|
|
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
|
|
/* GFXH-1625: TMUWT not allowed in the final instruction. */
|
|
if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
|
|
return false;
|
|
|
|
/* No writing physical registers at the end. */
|
|
if (!inst->alu.add.magic_write ||
|
|
!inst->alu.mul.magic_write) {
|
|
return false;
|
|
}
|
|
|
|
if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
|
|
!inst->sig_magic) {
|
|
return false;
|
|
}
|
|
|
|
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
|
|
return false;
|
|
|
|
/* RF0-2 might be overwritten during the delay slots by
|
|
* fragment shader setup.
|
|
*/
|
|
if (inst->raddr_a < 3 &&
|
|
(inst->alu.add.a == V3D_QPU_MUX_A ||
|
|
inst->alu.add.b == V3D_QPU_MUX_A ||
|
|
inst->alu.mul.a == V3D_QPU_MUX_A ||
|
|
inst->alu.mul.b == V3D_QPU_MUX_A)) {
|
|
return false;
|
|
}
|
|
|
|
if (inst->raddr_b < 3 &&
|
|
!inst->sig.small_imm &&
|
|
(inst->alu.add.a == V3D_QPU_MUX_B ||
|
|
inst->alu.add.b == V3D_QPU_MUX_B ||
|
|
inst->alu.mul.a == V3D_QPU_MUX_B ||
|
|
inst->alu.mul.b == V3D_QPU_MUX_B)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* This is called when trying to merge a thrsw back into the instruction stream
|
|
* of instructions that were scheduled *before* the thrsw signal to fill its
|
|
* delay slots. Because the actual execution of the thrsw happens after the
|
|
* delay slots, it is usually safe to do this, but there are some cases that
|
|
* need special care.
|
|
*/
|
|
static bool
|
|
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
|
|
const struct qinst *qinst,
|
|
uint32_t slot)
|
|
{
|
|
/* No scheduling SFU when the result would land in the other
|
|
* thread. The simulator complains for safety, though it
|
|
* would only occur for dead code in our case.
|
|
*/
|
|
if (slot > 0 &&
|
|
qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
|
|
(v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
|
|
v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
|
|
return false;
|
|
}
|
|
|
|
if (slot > 0 && qinst->qpu.sig.ldvary)
|
|
return false;
|
|
|
|
/* unifa and the following 3 instructions can't overlap a
|
|
* thread switch/end. The docs further clarify that this means
|
|
* the cycle at which the actual thread switch/end happens
|
|
* and not when the thrsw instruction is processed, which would
|
|
* be after the 2 delay slots following the thrsw instruction.
|
|
* This means that we can move up a thrsw up to the instruction
|
|
* right after unifa:
|
|
*
|
|
* unifa, r5
|
|
* thrsw
|
|
* delay slot 1
|
|
* delay slot 2
|
|
* Thread switch happens here, 4 instructions away from unifa
|
|
*/
|
|
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* This is called for instructions scheduled *after* a thrsw signal that may
|
|
* land in the delay slots of the thrsw. Because these instructions were
|
|
* scheduled after the thrsw, we need to be careful when placing them into
|
|
* the delay slots, since that means that we are moving them ahead of the
|
|
* thread switch and we need to ensure that is not a problem.
|
|
*/
|
|
static bool
|
|
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
|
|
struct choose_scoreboard *scoreboard,
|
|
const struct qinst *qinst)
|
|
{
|
|
const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
|
|
assert(slot <= 2);
|
|
|
|
/* We merge thrsw instructions back into the instruction stream
|
|
* manually, so any instructions scheduled after a thrsw shold be
|
|
* in the actual delay slots and not in the same slot as the thrsw.
|
|
*/
|
|
assert(slot >= 1);
|
|
|
|
/* No emitting a thrsw while the previous thrsw hasn't happened yet. */
|
|
if (qinst->qpu.sig.thrsw)
|
|
return false;
|
|
|
|
/* The restrictions for instructions scheduled before the the thrsw
|
|
* also apply to instructions scheduled after the thrsw that we want
|
|
* to place in its delay slots.
|
|
*/
|
|
if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
|
|
return false;
|
|
|
|
/* TLB access is disallowed until scoreboard wait is executed, which
|
|
* we do on the last thread switch.
|
|
*/
|
|
if (qpu_inst_is_tlb(&qinst->qpu))
|
|
return false;
|
|
|
|
/* Instruction sequence restrictions: Branch is not allowed in delay
|
|
* slots of a thrsw.
|
|
*/
|
|
if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
|
|
return false;
|
|
|
|
/* Miscellaneous restrictions: At the point of a thrsw we need to have
|
|
* at least one outstanding lookup or TSY wait.
|
|
*
|
|
* So avoid placing TMU instructions scheduled after the thrsw into
|
|
* its delay slots or we may be compromising the integrity of our TMU
|
|
* sequences. Also, notice that if we moved these instructions into
|
|
* the delay slots of a previous thrsw we could overflow our TMU output
|
|
* fifo, since we could be effectively pipelining a lookup scheduled
|
|
* after the thrsw into the sequence before the thrsw.
|
|
*/
|
|
if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
|
|
qinst->qpu.sig.wrtmuc) {
|
|
return false;
|
|
}
|
|
|
|
/* Don't move instructions that wait on the TMU before the thread switch
|
|
* happens since that would make the current thread stall before the
|
|
* switch, which is exactly what we want to avoid with the thrsw
|
|
* instruction.
|
|
*/
|
|
if (v3d_qpu_waits_on_tmu(&qinst->qpu))
|
|
return false;
|
|
|
|
/* A thread switch invalidates all accumulators, so don't place any
|
|
* instructions that write accumulators into the delay slots.
|
|
*/
|
|
if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
|
|
return false;
|
|
|
|
/* Multop has an implicit write to the rtop register which is an
|
|
* specialized accumulator that is only used with this instruction.
|
|
*/
|
|
if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
|
|
return false;
|
|
|
|
/* Flags are invalidated across a thread switch, so dont' place
|
|
* instructions that write flags into delay slots.
|
|
*/
|
|
if (v3d_qpu_writes_flags(&qinst->qpu))
|
|
return false;
|
|
|
|
/* TSY sync ops materialize at the point of the next thread switch,
|
|
* therefore, if we have a TSY sync right after a thread switch, we
|
|
* cannot place it in its delay slots, or we would be moving the sync
|
|
* to the thrsw before it instead.
|
|
*/
|
|
if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
|
|
struct qinst *qinst, int instructions_in_sequence,
|
|
bool is_thrend)
|
|
{
|
|
/* No emitting our thrsw while the previous thrsw hasn't happened yet. */
|
|
if (scoreboard->last_thrsw_tick + 3 >
|
|
scoreboard->tick - instructions_in_sequence) {
|
|
return false;
|
|
}
|
|
|
|
for (int slot = 0; slot < instructions_in_sequence; slot++) {
|
|
if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
|
|
return false;
|
|
|
|
if (is_thrend &&
|
|
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
|
|
return false;
|
|
}
|
|
|
|
/* Note that the list is circular, so we can only do this up
|
|
* to instructions_in_sequence.
|
|
*/
|
|
qinst = (struct qinst *)qinst->link.next;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Emits a THRSW signal in the stream, trying to move it up to pair with
|
|
* another instruction.
|
|
*/
|
|
static int
|
|
emit_thrsw(struct v3d_compile *c,
|
|
struct qblock *block,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct qinst *inst,
|
|
bool is_thrend)
|
|
{
|
|
int time = 0;
|
|
|
|
/* There should be nothing in a thrsw inst being scheduled other than
|
|
* the signal bits.
|
|
*/
|
|
assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
|
|
assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
|
|
assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
|
|
|
/* Don't try to emit a thrsw in the delay slots of a previous thrsw
|
|
* or branch.
|
|
*/
|
|
while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
|
|
emit_nop(c, block, scoreboard);
|
|
time++;
|
|
}
|
|
while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
|
|
emit_nop(c, block, scoreboard);
|
|
time++;
|
|
}
|
|
|
|
/* Find how far back into previous instructions we can put the THRSW. */
|
|
int slots_filled = 0;
|
|
int invalid_sig_count = 0;
|
|
bool last_thrsw_after_invalid_ok = false;
|
|
struct qinst *merge_inst = NULL;
|
|
vir_for_each_inst_rev(prev_inst, block) {
|
|
if (!valid_thrsw_sequence(c, scoreboard,
|
|
prev_inst, slots_filled + 1,
|
|
is_thrend)) {
|
|
break;
|
|
}
|
|
|
|
struct v3d_qpu_sig sig = prev_inst->qpu.sig;
|
|
sig.thrsw = true;
|
|
uint32_t packed_sig;
|
|
if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
|
|
/* If we can't merge the thrsw here because of signal
|
|
* incompatibility, keep going, we might be able to
|
|
* merge it in an earlier instruction.
|
|
*/
|
|
invalid_sig_count++;
|
|
goto cont_block;
|
|
}
|
|
|
|
/* For last thrsw we need 2 consecutive slots that are
|
|
* thrsw compatible, so if we have previously jumped over
|
|
* an incompatible signal, flag that we have found the first
|
|
* valid slot here and keep going.
|
|
*/
|
|
if (inst->is_last_thrsw && invalid_sig_count > 0 &&
|
|
!last_thrsw_after_invalid_ok) {
|
|
last_thrsw_after_invalid_ok = true;
|
|
invalid_sig_count++;
|
|
goto cont_block;
|
|
}
|
|
|
|
last_thrsw_after_invalid_ok = false;
|
|
invalid_sig_count = 0;
|
|
merge_inst = prev_inst;
|
|
|
|
cont_block:
|
|
if (++slots_filled == 3)
|
|
break;
|
|
}
|
|
|
|
/* If we jumped over a signal incompatibility and did not manage to
|
|
* merge the thrsw in the end, we need to adjust slots filled to match
|
|
* the last valid merge point.
|
|
*/
|
|
assert(invalid_sig_count == 0 || slots_filled >= invalid_sig_count);
|
|
if (invalid_sig_count > 0)
|
|
slots_filled -= invalid_sig_count;
|
|
|
|
bool needs_free = false;
|
|
if (merge_inst) {
|
|
merge_inst->qpu.sig.thrsw = true;
|
|
needs_free = true;
|
|
scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
|
|
} else {
|
|
scoreboard->last_thrsw_tick = scoreboard->tick;
|
|
insert_scheduled_instruction(c, block, scoreboard, inst);
|
|
time++;
|
|
slots_filled++;
|
|
merge_inst = inst;
|
|
}
|
|
|
|
scoreboard->first_thrsw_emitted = true;
|
|
|
|
/* If we're emitting the last THRSW (other than program end), then
|
|
* signal that to the HW by emitting two THRSWs in a row.
|
|
*/
|
|
if (inst->is_last_thrsw) {
|
|
if (slots_filled <= 1) {
|
|
emit_nop(c, block, scoreboard);
|
|
time++;
|
|
}
|
|
struct qinst *second_inst =
|
|
(struct qinst *)merge_inst->link.next;
|
|
second_inst->qpu.sig.thrsw = true;
|
|
scoreboard->last_thrsw_emitted = true;
|
|
}
|
|
|
|
/* Make sure the thread end executes within the program lifespan */
|
|
if (is_thrend) {
|
|
for (int i = 0; i < 3 - slots_filled; i++) {
|
|
emit_nop(c, block, scoreboard);
|
|
time++;
|
|
}
|
|
}
|
|
|
|
/* If we put our THRSW into another instruction, free up the
|
|
* instruction that didn't end up scheduled into the list.
|
|
*/
|
|
if (needs_free)
|
|
free(inst);
|
|
|
|
return time;
|
|
}
|
|
|
|
static bool
|
|
qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
|
|
{
|
|
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
|
|
return false;
|
|
|
|
if (inst->qpu.sig.thrsw)
|
|
return false;
|
|
|
|
if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
|
|
return false;
|
|
|
|
if (vir_has_uniform(inst))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
emit_branch(struct v3d_compile *c,
|
|
struct qblock *block,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct qinst *inst)
|
|
{
|
|
assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
|
|
|
|
/* We should've not picked up a branch for the delay slots of a previous
|
|
* thrsw, branch or unifa write instruction.
|
|
*/
|
|
int branch_tick = scoreboard->tick;
|
|
assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
|
|
assert(scoreboard->last_branch_tick + 3 < branch_tick);
|
|
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
|
|
|
|
/* Can't place a branch with msfign != 0 and cond != 0,2,3 after
|
|
* setmsf.
|
|
*/
|
|
bool is_safe_msf_branch =
|
|
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
|
|
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
|
|
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
|
|
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
|
|
assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
|
|
is_safe_msf_branch);
|
|
|
|
/* Insert the branch instruction */
|
|
insert_scheduled_instruction(c, block, scoreboard, inst);
|
|
|
|
/* Now see if we can move the branch instruction back into the
|
|
* instruction stream to fill its delay slots
|
|
*/
|
|
int slots_filled = 0;
|
|
while (slots_filled < 3 && block->instructions.next != &inst->link) {
|
|
struct qinst *prev_inst = (struct qinst *) inst->link.prev;
|
|
assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
|
|
|
|
/* Can't move the branch instruction if that would place it
|
|
* in the delay slots of other instructions.
|
|
*/
|
|
if (scoreboard->last_branch_tick + 3 >=
|
|
branch_tick - slots_filled - 1) {
|
|
break;
|
|
}
|
|
|
|
if (scoreboard->last_thrsw_tick + 2 >=
|
|
branch_tick - slots_filled - 1) {
|
|
break;
|
|
}
|
|
|
|
if (scoreboard->last_unifa_write_tick + 3 >=
|
|
branch_tick - slots_filled - 1) {
|
|
break;
|
|
}
|
|
|
|
/* Can't move a conditional branch before the instruction
|
|
* that writes the flags for its condition.
|
|
*/
|
|
if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
|
|
inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
|
|
break;
|
|
}
|
|
|
|
if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
|
|
break;
|
|
|
|
if (!is_safe_msf_branch) {
|
|
struct qinst *prev_prev_inst =
|
|
(struct qinst *) prev_inst->link.prev;
|
|
if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
|
|
prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
list_del(&prev_inst->link);
|
|
list_add(&prev_inst->link, &inst->link);
|
|
slots_filled++;
|
|
}
|
|
|
|
block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
|
|
scoreboard->last_branch_tick = branch_tick - slots_filled;
|
|
|
|
/* Fill any remaining delay slots.
|
|
*
|
|
* For unconditional branches we'll try to fill these with the
|
|
* first instructions in the successor block after scheduling
|
|
* all blocks when setting up branch targets.
|
|
*/
|
|
for (int i = 0; i < 3 - slots_filled; i++)
|
|
emit_nop(c, block, scoreboard);
|
|
}
|
|
|
|
static bool
|
|
alu_reads_register(struct v3d_qpu_instr *inst,
|
|
bool add, bool magic, uint32_t index)
|
|
{
|
|
uint32_t num_src;
|
|
enum v3d_qpu_mux mux_a, mux_b;
|
|
|
|
if (add) {
|
|
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
|
|
mux_a = inst->alu.add.a;
|
|
mux_b = inst->alu.add.b;
|
|
} else {
|
|
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
|
|
mux_a = inst->alu.mul.a;
|
|
mux_b = inst->alu.mul.b;
|
|
}
|
|
|
|
for (int i = 0; i < num_src; i++) {
|
|
if (magic) {
|
|
if (i == 0 && mux_a == index)
|
|
return true;
|
|
if (i == 1 && mux_b == index)
|
|
return true;
|
|
} else {
|
|
if (i == 0 && mux_a == V3D_QPU_MUX_A &&
|
|
inst->raddr_a == index) {
|
|
return true;
|
|
}
|
|
if (i == 0 && mux_a == V3D_QPU_MUX_B &&
|
|
inst->raddr_b == index) {
|
|
return true;
|
|
}
|
|
if (i == 1 && mux_b == V3D_QPU_MUX_A &&
|
|
inst->raddr_a == index) {
|
|
return true;
|
|
}
|
|
if (i == 1 && mux_b == V3D_QPU_MUX_B &&
|
|
inst->raddr_b == index) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* This takes and ldvary signal merged into 'inst' and tries to move it up to
|
|
* the previous instruction to get good pipelining of ldvary sequences,
|
|
* transforming this:
|
|
*
|
|
* nop ; nop ; ldvary.r4
|
|
* nop ; fmul r0, r4, rf0 ;
|
|
* fadd rf13, r0, r5 ; nop; ; ldvary.r1 <-- inst
|
|
*
|
|
* into:
|
|
*
|
|
* nop ; nop ; ldvary.r4
|
|
* nop ; fmul r0, r4, rf0 ; ldvary.r1
|
|
* fadd rf13, r0, r5 ; nop; ; <-- inst
|
|
*
|
|
* If we manage to do this successfully (we return true here), then flagging
|
|
* the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
|
|
* we will be able to pick up to merge into 'inst', leading to code like this:
|
|
*
|
|
* nop ; nop ; ldvary.r4
|
|
* nop ; fmul r0, r4, rf0 ; ldvary.r1
|
|
* fadd rf13, r0, r5 ; fmul r2, r1, rf0 ; <-- inst
|
|
*/
|
|
static bool
|
|
fixup_pipelined_ldvary(struct v3d_compile *c,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct qblock *block,
|
|
struct v3d_qpu_instr *inst)
|
|
{
|
|
/* We only call this if we have successfuly merged an ldvary into a
|
|
* previous instruction.
|
|
*/
|
|
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
|
|
assert(inst->sig.ldvary);
|
|
uint32_t ldvary_magic = inst->sig_magic;
|
|
uint32_t ldvary_index = inst->sig_addr;
|
|
|
|
/* The instruction in which we merged the ldvary cannot read
|
|
* the ldvary destination, if it does, then moving the ldvary before
|
|
* it would overwrite it.
|
|
*/
|
|
if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
|
|
return false;
|
|
if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
|
|
return false;
|
|
|
|
/* The implicit ldvary destination may not be written to by a signal
|
|
* in the instruction following ldvary. Since we are planning to move
|
|
* ldvary to the previous instruction, this means we need to check if
|
|
* the current instruction has any other signal that could create this
|
|
* conflict. The only other signal that can write to the implicit
|
|
* ldvary destination that is compatible with ldvary in the same
|
|
* instruction is ldunif.
|
|
*/
|
|
if (inst->sig.ldunif)
|
|
return false;
|
|
|
|
/* The previous instruction can't write to the same destination as the
|
|
* ldvary.
|
|
*/
|
|
struct qinst *prev = (struct qinst *) block->instructions.prev;
|
|
if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
|
|
return false;
|
|
|
|
if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
|
|
if (prev->qpu.alu.add.magic_write == ldvary_magic &&
|
|
prev->qpu.alu.add.waddr == ldvary_index) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
|
|
if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
|
|
prev->qpu.alu.mul.waddr == ldvary_index) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* The previous instruction cannot have a conflicting signal */
|
|
if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
|
|
return false;
|
|
|
|
uint32_t sig;
|
|
struct v3d_qpu_sig new_sig = prev->qpu.sig;
|
|
new_sig.ldvary = true;
|
|
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
|
|
return false;
|
|
|
|
/* The previous instruction cannot use flags since ldvary uses the
|
|
* 'cond' instruction field to store the destination.
|
|
*/
|
|
if (v3d_qpu_writes_flags(&prev->qpu))
|
|
return false;
|
|
if (v3d_qpu_reads_flags(&prev->qpu))
|
|
return false;
|
|
|
|
/* We can't put an ldvary in the delay slots of a thrsw. We should've
|
|
* prevented this when pairing up the ldvary with another instruction
|
|
* and flagging it for a fixup.
|
|
*/
|
|
assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
|
|
|
|
/* Move the ldvary to the previous instruction and remove it from the
|
|
* current one.
|
|
*/
|
|
prev->qpu.sig.ldvary = true;
|
|
prev->qpu.sig_magic = ldvary_magic;
|
|
prev->qpu.sig_addr = ldvary_index;
|
|
scoreboard->last_ldvary_tick = scoreboard->tick - 1;
|
|
|
|
inst->sig.ldvary = false;
|
|
inst->sig_magic = false;
|
|
inst->sig_addr = 0;
|
|
|
|
/* By moving ldvary to the previous instruction we make it update
|
|
* r5 in the current one, so nothing else in it should write r5.
|
|
* This should've been prevented by our dependency tracking, which
|
|
* would not allow ldvary to be paired up with an instruction that
|
|
* writes r5 (since our dependency tracking doesn't know that the
|
|
* ldvary write r5 happens in the next instruction).
|
|
*/
|
|
assert(!v3d_qpu_writes_r5(c->devinfo, inst));
|
|
|
|
return true;
|
|
}
|
|
|
|
static uint32_t
|
|
schedule_instructions(struct v3d_compile *c,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct qblock *block,
|
|
enum quniform_contents *orig_uniform_contents,
|
|
uint32_t *orig_uniform_data,
|
|
uint32_t *next_uniform)
|
|
{
|
|
const struct v3d_device_info *devinfo = c->devinfo;
|
|
uint32_t time = 0;
|
|
|
|
while (!list_is_empty(&scoreboard->dag->heads)) {
|
|
struct schedule_node *chosen =
|
|
choose_instruction_to_schedule(c, scoreboard, NULL);
|
|
struct schedule_node *merge = NULL;
|
|
|
|
/* If there are no valid instructions to schedule, drop a NOP
|
|
* in.
|
|
*/
|
|
struct qinst *qinst = chosen ? chosen->inst : vir_nop();
|
|
struct v3d_qpu_instr *inst = &qinst->qpu;
|
|
|
|
if (debug) {
|
|
fprintf(stderr, "t=%4d: current list:\n",
|
|
time);
|
|
dump_state(devinfo, scoreboard->dag);
|
|
fprintf(stderr, "t=%4d: chose: ", time);
|
|
v3d_qpu_dump(devinfo, inst);
|
|
fprintf(stderr, "\n");
|
|
}
|
|
|
|
/* We can't mark_instruction_scheduled() the chosen inst until
|
|
* we're done identifying instructions to merge, so put the
|
|
* merged instructions on a list for a moment.
|
|
*/
|
|
struct list_head merged_list;
|
|
list_inithead(&merged_list);
|
|
|
|
/* Schedule this instruction onto the QPU list. Also try to
|
|
* find an instruction to pair with it.
|
|
*/
|
|
if (chosen) {
|
|
time = MAX2(chosen->unblocked_time, time);
|
|
pre_remove_head(scoreboard->dag, chosen);
|
|
|
|
while ((merge =
|
|
choose_instruction_to_schedule(c, scoreboard,
|
|
chosen))) {
|
|
time = MAX2(merge->unblocked_time, time);
|
|
pre_remove_head(scoreboard->dag, merge);
|
|
list_addtail(&merge->link, &merged_list);
|
|
(void)qpu_merge_inst(devinfo, inst,
|
|
inst, &merge->inst->qpu);
|
|
if (merge->inst->uniform != -1) {
|
|
chosen->inst->uniform =
|
|
merge->inst->uniform;
|
|
}
|
|
|
|
if (debug) {
|
|
fprintf(stderr, "t=%4d: merging: ",
|
|
time);
|
|
v3d_qpu_dump(devinfo, &merge->inst->qpu);
|
|
fprintf(stderr, "\n");
|
|
fprintf(stderr, " result: ");
|
|
v3d_qpu_dump(devinfo, inst);
|
|
fprintf(stderr, "\n");
|
|
}
|
|
|
|
if (scoreboard->fixup_ldvary) {
|
|
scoreboard->fixup_ldvary = false;
|
|
if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
|
|
/* Flag the ldvary as scheduled
|
|
* now so we can try to merge the
|
|
* follow-up instruction in the
|
|
* the ldvary sequence into the
|
|
* current instruction.
|
|
*/
|
|
mark_instruction_scheduled(
|
|
devinfo, scoreboard->dag,
|
|
time, merge);
|
|
}
|
|
}
|
|
}
|
|
if (mux_read_stalls(scoreboard, inst))
|
|
c->qpu_inst_stalled_count++;
|
|
}
|
|
|
|
/* Update the uniform index for the rewritten location --
|
|
* branch target updating will still need to change
|
|
* c->uniform_data[] using this index.
|
|
*/
|
|
if (qinst->uniform != -1) {
|
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
|
|
block->branch_uniform = *next_uniform;
|
|
|
|
c->uniform_data[*next_uniform] =
|
|
orig_uniform_data[qinst->uniform];
|
|
c->uniform_contents[*next_uniform] =
|
|
orig_uniform_contents[qinst->uniform];
|
|
qinst->uniform = *next_uniform;
|
|
(*next_uniform)++;
|
|
}
|
|
|
|
if (debug) {
|
|
fprintf(stderr, "\n");
|
|
}
|
|
|
|
/* Now that we've scheduled a new instruction, some of its
|
|
* children can be promoted to the list of instructions ready to
|
|
* be scheduled. Update the children's unblocked time for this
|
|
* DAG edge as we do so.
|
|
*/
|
|
mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
|
|
list_for_each_entry(struct schedule_node, merge, &merged_list,
|
|
link) {
|
|
mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
|
|
|
|
/* The merged VIR instruction doesn't get re-added to the
|
|
* block, so free it now.
|
|
*/
|
|
free(merge->inst);
|
|
}
|
|
|
|
if (inst->sig.thrsw) {
|
|
time += emit_thrsw(c, block, scoreboard, qinst, false);
|
|
} else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
|
emit_branch(c, block, scoreboard, qinst);
|
|
} else {
|
|
insert_scheduled_instruction(c, block,
|
|
scoreboard, qinst);
|
|
}
|
|
}
|
|
|
|
return time;
|
|
}
|
|
|
|
static uint32_t
|
|
qpu_schedule_instructions_block(struct v3d_compile *c,
|
|
struct choose_scoreboard *scoreboard,
|
|
struct qblock *block,
|
|
enum quniform_contents *orig_uniform_contents,
|
|
uint32_t *orig_uniform_data,
|
|
uint32_t *next_uniform)
|
|
{
|
|
void *mem_ctx = ralloc_context(NULL);
|
|
scoreboard->dag = dag_create(mem_ctx);
|
|
struct list_head setup_list;
|
|
|
|
list_inithead(&setup_list);
|
|
|
|
/* Wrap each instruction in a scheduler structure. */
|
|
while (!list_is_empty(&block->instructions)) {
|
|
struct qinst *qinst = (struct qinst *)block->instructions.next;
|
|
struct schedule_node *n =
|
|
rzalloc(mem_ctx, struct schedule_node);
|
|
|
|
dag_init_node(scoreboard->dag, &n->dag);
|
|
n->inst = qinst;
|
|
|
|
list_del(&qinst->link);
|
|
list_addtail(&n->link, &setup_list);
|
|
}
|
|
|
|
calculate_forward_deps(c, scoreboard->dag, &setup_list);
|
|
calculate_reverse_deps(c, scoreboard->dag, &setup_list);
|
|
|
|
dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
|
|
|
|
uint32_t cycles = schedule_instructions(c, scoreboard, block,
|
|
orig_uniform_contents,
|
|
orig_uniform_data,
|
|
next_uniform);
|
|
|
|
ralloc_free(mem_ctx);
|
|
scoreboard->dag = NULL;
|
|
|
|
return cycles;
|
|
}
|
|
|
|
static void
|
|
qpu_set_branch_targets(struct v3d_compile *c)
|
|
{
|
|
vir_for_each_block(block, c) {
|
|
/* The end block of the program has no branch. */
|
|
if (!block->successors[0])
|
|
continue;
|
|
|
|
/* If there was no branch instruction, then the successor
|
|
* block must follow immediately after this one.
|
|
*/
|
|
if (block->branch_qpu_ip == ~0) {
|
|
assert(block->end_qpu_ip + 1 ==
|
|
block->successors[0]->start_qpu_ip);
|
|
continue;
|
|
}
|
|
|
|
/* Walk back through the delay slots to find the branch
|
|
* instr.
|
|
*/
|
|
struct qinst *branch = NULL;
|
|
struct list_head *entry = block->instructions.prev;
|
|
int32_t delay_slot_count = -1;
|
|
struct qinst *delay_slots_start = NULL;
|
|
for (int i = 0; i < 3; i++) {
|
|
entry = entry->prev;
|
|
struct qinst *inst =
|
|
container_of(entry, struct qinst, link);
|
|
|
|
if (delay_slot_count == -1) {
|
|
if (!v3d_qpu_is_nop(&inst->qpu))
|
|
delay_slot_count = i;
|
|
else
|
|
delay_slots_start = inst;
|
|
}
|
|
|
|
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
|
branch = inst;
|
|
break;
|
|
}
|
|
}
|
|
assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
|
|
assert(delay_slot_count >= 0 && delay_slot_count <= 3);
|
|
assert(delay_slot_count == 0 || delay_slots_start != NULL);
|
|
|
|
/* Make sure that the if-we-don't-jump
|
|
* successor was scheduled just after the
|
|
* delay slots.
|
|
*/
|
|
assert(!block->successors[1] ||
|
|
block->successors[1]->start_qpu_ip ==
|
|
block->branch_qpu_ip + 4);
|
|
|
|
branch->qpu.branch.offset =
|
|
((block->successors[0]->start_qpu_ip -
|
|
(block->branch_qpu_ip + 4)) *
|
|
sizeof(uint64_t));
|
|
|
|
/* Set up the relative offset to jump in the
|
|
* uniform stream.
|
|
*
|
|
* Use a temporary here, because
|
|
* uniform_data[inst->uniform] may be shared
|
|
* between multiple instructions.
|
|
*/
|
|
assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
|
|
c->uniform_data[branch->uniform] =
|
|
(block->successors[0]->start_uniform -
|
|
(block->branch_uniform + 1)) * 4;
|
|
|
|
/* If this is an unconditional branch, try to fill any remaining
|
|
* delay slots with the initial instructions of the successor
|
|
* block.
|
|
*
|
|
* FIXME: we can do the same for conditional branches if we
|
|
* predicate the instructions to match the branch condition.
|
|
*/
|
|
if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
|
|
struct list_head *successor_insts =
|
|
&block->successors[0]->instructions;
|
|
delay_slot_count = MIN2(delay_slot_count,
|
|
list_length(successor_insts));
|
|
struct qinst *s_inst =
|
|
(struct qinst *) successor_insts->next;
|
|
struct qinst *slot = delay_slots_start;
|
|
int slots_filled = 0;
|
|
while (slots_filled < delay_slot_count &&
|
|
qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
|
|
memcpy(&slot->qpu, &s_inst->qpu,
|
|
sizeof(slot->qpu));
|
|
s_inst = (struct qinst *) s_inst->link.next;
|
|
slot = (struct qinst *) slot->link.next;
|
|
slots_filled++;
|
|
}
|
|
branch->qpu.branch.offset +=
|
|
slots_filled * sizeof(uint64_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
uint32_t
|
|
v3d_qpu_schedule_instructions(struct v3d_compile *c)
|
|
{
|
|
const struct v3d_device_info *devinfo = c->devinfo;
|
|
struct qblock *end_block = list_last_entry(&c->blocks,
|
|
struct qblock, link);
|
|
|
|
/* We reorder the uniforms as we schedule instructions, so save the
|
|
* old data off and replace it.
|
|
*/
|
|
uint32_t *uniform_data = c->uniform_data;
|
|
enum quniform_contents *uniform_contents = c->uniform_contents;
|
|
c->uniform_contents = ralloc_array(c, enum quniform_contents,
|
|
c->num_uniforms);
|
|
c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
|
|
c->uniform_array_size = c->num_uniforms;
|
|
uint32_t next_uniform = 0;
|
|
|
|
struct choose_scoreboard scoreboard;
|
|
memset(&scoreboard, 0, sizeof(scoreboard));
|
|
scoreboard.last_ldvary_tick = -10;
|
|
scoreboard.last_unifa_write_tick = -10;
|
|
scoreboard.last_magic_sfu_write_tick = -10;
|
|
scoreboard.last_uniforms_reset_tick = -10;
|
|
scoreboard.last_thrsw_tick = -10;
|
|
scoreboard.last_branch_tick = -10;
|
|
scoreboard.last_setmsf_tick = -10;
|
|
scoreboard.last_stallable_sfu_tick = -10;
|
|
|
|
if (debug) {
|
|
fprintf(stderr, "Pre-schedule instructions\n");
|
|
vir_for_each_block(block, c) {
|
|
fprintf(stderr, "BLOCK %d\n", block->index);
|
|
list_for_each_entry(struct qinst, qinst,
|
|
&block->instructions, link) {
|
|
v3d_qpu_dump(devinfo, &qinst->qpu);
|
|
fprintf(stderr, "\n");
|
|
}
|
|
}
|
|
fprintf(stderr, "\n");
|
|
}
|
|
|
|
uint32_t cycles = 0;
|
|
vir_for_each_block(block, c) {
|
|
block->start_qpu_ip = c->qpu_inst_count;
|
|
block->branch_qpu_ip = ~0;
|
|
block->start_uniform = next_uniform;
|
|
|
|
cycles += qpu_schedule_instructions_block(c,
|
|
&scoreboard,
|
|
block,
|
|
uniform_contents,
|
|
uniform_data,
|
|
&next_uniform);
|
|
|
|
block->end_qpu_ip = c->qpu_inst_count - 1;
|
|
}
|
|
|
|
/* Emit the program-end THRSW instruction. */;
|
|
struct qinst *thrsw = vir_nop();
|
|
thrsw->qpu.sig.thrsw = true;
|
|
emit_thrsw(c, end_block, &scoreboard, thrsw, true);
|
|
|
|
qpu_set_branch_targets(c);
|
|
|
|
assert(next_uniform == c->num_uniforms);
|
|
|
|
return cycles;
|
|
}
|