From d8a25bdb0717efa16f65bf019b692aec3e2973e4 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 27 Oct 2021 11:35:12 +0200
Subject: [PATCH] broadcom/compiler: enable ldvary pipelining on v71
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25450>
---
 src/broadcom/compiler/qpu_schedule.c | 123 ++++++++++++++++++---------
 1 file changed, 81 insertions(+), 42 deletions(-)

diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 100ffca12ae..f0d59973e66 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -2312,46 +2312,72 @@ emit_branch(struct v3d_compile *c,
 }
 
 static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+                   struct v3d_qpu_instr *inst,
                    bool add, bool magic, uint32_t index)
 {
         uint32_t num_src;
-        enum v3d_qpu_mux mux_a, mux_b;
-
-        if (add) {
+        if (add)
                 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
-                mux_a = inst->alu.add.a.mux;
-                mux_b = inst->alu.add.b.mux;
-        } else {
+        else
                 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
-                mux_a = inst->alu.mul.a.mux;
-                mux_b = inst->alu.mul.b.mux;
+
+        if (devinfo->ver <= 42) {
+                enum v3d_qpu_mux mux_a, mux_b;
+                if (add) {
+                        mux_a = inst->alu.add.a.mux;
+                        mux_b = inst->alu.add.b.mux;
+                } else {
+                        mux_a = inst->alu.mul.a.mux;
+                        mux_b = inst->alu.mul.b.mux;
+                }
+
+                for (int i = 0; i < num_src; i++) {
+                        if (magic) {
+                                if (i == 0 && mux_a == index)
+                                        return true;
+                                if (i == 1 && mux_b == index)
+                                        return true;
+                        } else {
+                                if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+                                    inst->raddr_a == index) {
+                                        return true;
+                                }
+                                if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+                                    inst->raddr_b == index) {
+                                        return true;
+                                }
+                                if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+                                    inst->raddr_a == index) {
+                                        return true;
+                                }
+                                if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+                                    inst->raddr_b == index) {
+                                        return true;
+                                }
+                        }
+                }
+
+                return false;
+        }
+
+        assert(devinfo->ver >= 71);
+        assert(!magic);
+
+        uint32_t raddr_a, raddr_b;
+        if (add) {
+                raddr_a = inst->alu.add.a.raddr;
+                raddr_b = inst->alu.add.b.raddr;
+        } else {
+                raddr_a = inst->alu.mul.a.raddr;
+                raddr_b = inst->alu.mul.b.raddr;
         }
 
         for (int i = 0; i < num_src; i++) {
-                if (magic) {
-                        if (i == 0 && mux_a == index)
-                                return true;
-                        if (i == 1 && mux_b == index)
-                                return true;
-                } else {
-                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
-                            inst->raddr_a == index) {
-                                return true;
-                        }
-                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
-                            inst->raddr_b == index) {
-                                return true;
-                        }
-                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
-                            inst->raddr_a == index) {
-                                return true;
-                        }
-                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
-                            inst->raddr_b == index) {
-                                return true;
-                        }
-                }
+                if (i == 0 && raddr_a == index)
+                        return true;
+                if (i == 1 && raddr_b == index)
+                        return true;
         }
 
         return false;
@@ -2386,6 +2412,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
                        struct qblock *block,
                        struct v3d_qpu_instr *inst)
 {
+        const struct v3d_device_info *devinfo = c->devinfo;
+
         /* We only call this if we have successfully merged an ldvary into a
          * previous instruction.
          */
@@ -2398,9 +2426,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
          * the ldvary destination, if it does, then moving the ldvary before
          * it would overwrite it.
          */
-        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+        if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
                 return false;
-        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+        if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
                 return false;
 
         /* The implicit ldvary destination may not be written to by a signal
@@ -2436,13 +2464,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
         }
 
         /* The previous instruction cannot have a conflicting signal */
-        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+        if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
                 return false;
 
         uint32_t sig;
         struct v3d_qpu_sig new_sig = prev->qpu.sig;
         new_sig.ldvary = true;
-        if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
+        if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
                 return false;
 
         /* The previous instruction cannot use flags since ldvary uses the
@@ -2471,14 +2499,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
         inst->sig_magic = false;
         inst->sig_addr = 0;
 
-        /* By moving ldvary to the previous instruction we make it update
-         * r5 in the current one, so nothing else in it should write r5.
-         * This should've been prevented by our dependency tracking, which
+        /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+        if (devinfo->ver >= 71) {
+                scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+                set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+        }
+
+        /* By moving ldvary to the previous instruction we make it update r5
+         * (rf0 for ver >= 71) in the current one, so nothing else in it
+         * should write this register.
+         *
+         * This should've been prevented by our depedency tracking, which
          * would not allow ldvary to be paired up with an instruction that
-         * writes r5 (since our dependency tracking doesn't know that the
-         * ldvary write r5 happens in the next instruction).
+         * writes r5/rf0 (since our dependency tracking doesn't know that the
+         * ldvary write to r5/rf0 happens in the next instruction).
          */
-        assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+        assert(!v3d_qpu_writes_r5(devinfo, inst));
+        assert(devinfo->ver <= 42 ||
+               (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+                !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
 
         return true;
 }