broadcom/compiler: be more flexible scheduling TMU writes

V3D 4.x allows more flexibility, so take advantage of that. Generally, we can reorder any writes in the same sequence, so long as they are neither the sequence terminator (which must always be last, since it is the one triggering the operation) nor TMUD writes, since these must be ordered with respect to each other.

total instructions in shared programs: 13735183 -> 13731927 (-0.02%)
instructions in affected programs: 903057 -> 899801 (-0.36%)
helped: 2358
HURT: 746
Instructions are helped.

total max-temps in shared programs: 2322020 -> 2322009 (<.01%)
max-temps in affected programs: 619 -> 608 (-1.78%)
helped: 19
HURT: 11
Inconclusive result (value mean confidence interval includes 0).

total sfu-stalls in shared programs: 31494 -> 31489 (-0.02%)
sfu-stalls in affected programs: 182 -> 177 (-2.75%)
helped: 40
HURT: 40
Inconclusive result (value mean confidence interval includes 0).

total inst-and-stalls in shared programs: 13766677 -> 13763416 (-0.02%)
inst-and-stalls in affected programs: 901343 -> 898082 (-0.36%)
helped: 2349
HURT: 746
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9555>
2026-05-09 04:38:03 +02:00 · 2021-03-12 09:54:49 +01:00 · 2021-03-12 09:54:49 +01:00 · 177dcd4b68
commit 177dcd4b68
parent 87ed614c47
1 changed files with 36 additions and 15 deletions
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@ -168,6 +168,36 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
        }
 }

+/* Returns true if a write to the magic register address `waddr` is treated
+ * as the terminator of a TMU write sequence, i.e. one of TMUS, TMUSCM, TMUSF,
+ * TMUSLOD, TMUA or TMUAU. Any other address returns false.
+ *
+ * The terminator is the write that triggers the TMU operation, so it must
+ * always be the last write of its sequence.
+ */
+static bool
+tmu_write_is_sequence_terminator(uint32_t waddr)
+{
+        switch (waddr) {
+        case V3D_QPU_WADDR_TMUS:
+        case V3D_QPU_WADDR_TMUSCM:
+        case V3D_QPU_WADDR_TMUSF:
+        case V3D_QPU_WADDR_TMUSLOD:
+        case V3D_QPU_WADDR_TMUA:
+        case V3D_QPU_WADDR_TMUAU:
+                return true;
+        default:
+                return false;
+        }
+}
+
+/* Returns true if a TMU write to `waddr` may be reordered relative to other
+ * TMU writes on the device described by `devinfo`.
+ *
+ * Reordering is refused when the device version is below 40, when the write
+ * is a sequence terminator, and when the write targets TMUD. Every other
+ * case returns true.
+ */
+static bool
+can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
+{
+        /* Only V3D 4.x and later get the extra scheduling flexibility. */
+        if (devinfo->ver < 40)
+                return false;
+
+        /* The terminator triggers the operation, so it must stay last. */
+        if (tmu_write_is_sequence_terminator(waddr))
+                return false;
+
+        /* TMUD writes must remain ordered with respect to each other. */
+        if (waddr == V3D_QPU_WADDR_TMUD)
+                return false;
+
+        return true;
+}

 static void
 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
@ -176,22 +206,13 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
-                /* XXX perf: For V3D 4.x, we could reorder TMU writes other
-                 * than the TMUS/TMUD/TMUA to improve scheduling flexibility.
-                 */
-                add_write_dep(state, &state->last_tmu_write, n);
-                switch (waddr) {
-                case V3D_QPU_WADDR_TMUS:
-                case V3D_QPU_WADDR_TMUSCM:
-                case V3D_QPU_WADDR_TMUSF:
-                case V3D_QPU_WADDR_TMUSLOD:
-                case V3D_QPU_WADDR_TMUA:
-                case V3D_QPU_WADDR_TMUAU:
+                if (can_reorder_tmu_write(state->devinfo, waddr))
+                        add_read_dep(state, state->last_tmu_write, n);
+                else
+                        add_write_dep(state, &state->last_tmu_write, n);
+
+                if (tmu_write_is_sequence_terminator(waddr))
                        add_write_dep(state, &state->last_tmu_config, n);
-                        break;
-                default:
-                        break;
-                }
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {