broadcom/compiler: be more flexible scheduling TMU writes

V3D 4.x allows more flexibility, so take advantage of that. Generally,
we can reorder any writes in the same sequence, as long as they are
neither the sequence terminator (which must always be last, since it is
the one that triggers the operation) nor TMUD writes, since those must
stay ordered with respect to each other.

total instructions in shared programs: 13735183 -> 13731927 (-0.02%)
instructions in affected programs: 903057 -> 899801 (-0.36%)
helped: 2358
HURT: 746
Instructions are helped.

total max-temps in shared programs: 2322020 -> 2322009 (<.01%)
max-temps in affected programs: 619 -> 608 (-1.78%)
helped: 19
HURT: 11
Inconclusive result (value mean confidence interval includes 0).

total sfu-stalls in shared programs: 31494 -> 31489 (-0.02%)
sfu-stalls in affected programs: 182 -> 177 (-2.75%)
helped: 40
HURT: 40
Inconclusive result (value mean confidence interval includes 0).

total inst-and-stalls in shared programs: 13766677 -> 13763416 (-0.02%)
inst-and-stalls in affected programs: 901343 -> 898082 (-0.36%)
helped: 2349
HURT: 746
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9555>
This commit is contained in:
Iago Toral Quiroga 2021-03-12 09:54:49 +01:00
parent 87ed614c47
commit 177dcd4b68

View file

@ -168,6 +168,36 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
/**
 * Reports whether a write to the given magic waddr is one of the TMU
 * registers that terminates a TMU write sequence (TMUS, TMUSCM, TMUSF,
 * TMUSLOD, TMUA or TMUAU).
 *
 * The terminating write is the one that triggers the TMU operation, so it
 * has to remain the last write of its sequence.
 */
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
        return waddr == V3D_QPU_WADDR_TMUS ||
               waddr == V3D_QPU_WADDR_TMUSCM ||
               waddr == V3D_QPU_WADDR_TMUSF ||
               waddr == V3D_QPU_WADDR_TMUSLOD ||
               waddr == V3D_QPU_WADDR_TMUA ||
               waddr == V3D_QPU_WADDR_TMUAU;
}
/**
 * Decides whether a TMU register write may be scheduled out of order with
 * respect to the other TMU writes of the same sequence.
 *
 * Reordering is refused when:
 *  - the device version is below 40 (the added flexibility is only used
 *    for V3D 4.x),
 *  - the write terminates the sequence, since it must stay last, or
 *  - the write targets TMUD, since TMUD writes must keep their relative
 *    order.
 *
 * The checks short-circuit in that order, so the terminator lookup is only
 * performed for devices that pass the version check.
 */
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
        return devinfo->ver >= 40 &&
               !tmu_write_is_sequence_terminator(waddr) &&
               waddr != V3D_QPU_WADDR_TMUD;
}
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
@ -176,22 +206,13 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
if (!magic) {
add_write_dep(state, &state->last_rf[waddr], n);
} else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
/* XXX perf: For V3D 4.x, we could reorder TMU writes other
* than the TMUS/TMUD/TMUA to improve scheduling flexibility.
*/
add_write_dep(state, &state->last_tmu_write, n);
switch (waddr) {
case V3D_QPU_WADDR_TMUS:
case V3D_QPU_WADDR_TMUSCM:
case V3D_QPU_WADDR_TMUSF:
case V3D_QPU_WADDR_TMUSLOD:
case V3D_QPU_WADDR_TMUA:
case V3D_QPU_WADDR_TMUAU:
if (can_reorder_tmu_write(state->devinfo, waddr))
add_read_dep(state, state->last_tmu_write, n);
else
add_write_dep(state, &state->last_tmu_write, n);
if (tmu_write_is_sequence_terminator(waddr))
add_write_dep(state, &state->last_tmu_config, n);
break;
default:
break;
}
} else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
/* Handled by v3d_qpu_writes_r4() check. */
} else {