broadcom/compiler: always restart ldvary pipelining when scheduling ldvary

When we were only able to pipeline smooth varyings, if we had to disable
ldvary pipelining in the middle of a sequence it would stay disabled for
the rest of the program, to prevent us from prioritizing scheduling of
ldvary instructions that we would not be able to pipeline effectively.
Now that we can pipeline all ldvary sequences we can change this.

This change re-enables ldvary pipelining upon finding the next
ldvary in the program in the hopes that we can continue pipelining
successfully. To do this, we track the number of ldvary instructions we
emitted so far and compare that to the number of inputs in the fragment
shader we are scheduling. This also allows us to simplify our ldvary
tracking at NIR-to-VIR time, since that is all now handled in the QPU
scheduler.

total instructions in shared programs: 13817048 -> 13810783 (-0.05%)
instructions in affected programs: 810114 -> 803849 (-0.77%)
helped: 4843
HURT: 591
Instructions are helped.

total max-temps in shared programs: 2326612 -> 2326300 (-0.01%)
max-temps in affected programs: 4689 -> 4377 (-6.65%)
helped: 285
HURT: 7
Max-temps are helped.

total sfu-stalls in shared programs: 30942 -> 30865 (-0.25%)
sfu-stalls in affected programs: 207 -> 130 (-37.20%)
helped: 120
HURT: 42
Sfu-stalls are helped.

total inst-and-stalls in shared programs: 13847990 -> 13841648 (-0.05%)
inst-and-stalls in affected programs: 825378 -> 819036 (-0.77%)
helped: 4899
HURT: 590
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9404>
This commit is contained in:
Iago Toral Quiroga 2021-03-04 09:21:53 +01:00
parent 2169c4f763
commit 839007e490
3 changed files with 56 additions and 113 deletions

View file

@@ -928,78 +928,33 @@ ldvary_sequence_inst(struct v3d_compile *c, struct qreg result)
(struct qinst *) c->cur_block->instructions.prev;
assert(producer);
producer->is_ldvary_sequence = true;
c->ldvary_sequence_end_inst = producer;
return result;
}
static void
track_ldvary_pipelining(struct v3d_compile *c, struct qinst *ldvary)
{
if (ldvary) {
ldvary->is_ldvary_sequence = true;
c->ldvary_sequence_length++;
if (c->ldvary_sequence_length == 1) {
ldvary->ldvary_pipelining_start = true;
c->ldvary_sequence_start_inst = ldvary;
}
}
}
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
struct qinst *ldvary,
struct qreg vary, struct qreg w, struct qreg r5)
{
track_ldvary_pipelining(c, ldvary);
return ldvary_sequence_inst(c, vir_FADD(c,
ldvary_sequence_inst(c, vir_FMUL(c, vary, w)), r5));
}
static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
struct qinst *ldvary,
struct qreg vary, struct qreg r5)
{
track_ldvary_pipelining(c, ldvary);
return ldvary_sequence_inst(c, vir_FADD(c,
ldvary_sequence_inst(c, vir_MOV(c, vary)), r5));
}
static struct qreg
emit_flat_varying(struct v3d_compile *c,
struct qinst *ldvary,
struct qreg vary, struct qreg r5)
{
track_ldvary_pipelining(c, ldvary);
vir_MOV_dest(c, c->undef, vary);
return ldvary_sequence_inst(c, vir_MOV(c, r5));
}
static void
varying_sequence_end(struct v3d_compile *c)
{
if (!c->ldvary_sequence_start_inst) {
assert(!c->ldvary_sequence_end_inst);
assert(c->ldvary_sequence_length == 0);
return;
}
assert(c->ldvary_sequence_start_inst);
assert(c->ldvary_sequence_end_inst);
assert(c->ldvary_sequence_start_inst != c->ldvary_sequence_end_inst);
/* We need at least two ldvary sequences to do some pipelining */
if (c->ldvary_sequence_length == 1)
c->ldvary_sequence_start_inst->ldvary_pipelining_start = false;
if (c->ldvary_sequence_length > 1)
c->ldvary_sequence_end_inst->ldvary_pipelining_end = true;
c->ldvary_sequence_length = 0;
c->ldvary_sequence_start_inst = NULL;
c->ldvary_sequence_end_inst = NULL;
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
@@ -1013,6 +968,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
c->undef, c->undef);
ldvary->qpu.sig.ldvary = true;
ldvary->is_ldvary_sequence = true;
vary = vir_emit_def(c, ldvary);
} else {
vir_NOP(c)->qpu.sig.ldvary = true;
@@ -1035,7 +991,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
return emit_smooth_varying(c, ldvary, vary, c->payload_w, r5);
return emit_smooth_varying(c, vary, c->payload_w, r5);
}
int i = c->num_inputs++;
@@ -1049,22 +1005,21 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
case INTERP_MODE_SMOOTH:
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = emit_smooth_varying(c, ldvary, vary,
result = emit_smooth_varying(c, vary,
c->payload_w_centroid, r5);
} else {
result = emit_smooth_varying(c, ldvary, vary,
c->payload_w, r5);
result = emit_smooth_varying(c, vary, c->payload_w, r5);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
BITSET_SET(c->noperspective_flags, i);
result = emit_noperspective_varying(c, ldvary, vary, r5);
result = emit_noperspective_varying(c, vary, r5);
break;
case INTERP_MODE_FLAT:
BITSET_SET(c->flat_shade_flags, i);
result = emit_flat_varying(c, ldvary, vary, r5);
result = emit_flat_varying(c, vary, r5);
break;
default:
@@ -2099,8 +2054,6 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
}
}
}
varying_sequence_end(c);
}
static void

View file

@@ -461,6 +461,7 @@ struct choose_scoreboard {
bool tlb_locked;
bool ldvary_pipelining;
bool fixup_ldvary;
int ldvary_count;
};
static bool
@@ -875,7 +876,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
choose_instruction_to_schedule(struct v3d_compile *c,
struct choose_scoreboard *scoreboard,
struct schedule_node *prev_inst)
{
@@ -900,12 +901,6 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
continue;
}
/* Sanity check: if we are scheduling a smooth ldvary sequence
* we cannot be starting another sequence in the middle of it.
*/
assert(!scoreboard->ldvary_pipelining ||
!n->inst->ldvary_pipelining_start);
const struct v3d_qpu_instr *inst = &n->inst->qpu;
/* Simulator complains if we have two uniforms loaded in the
@@ -947,7 +942,7 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
if (reads_too_soon_after_write(scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
/* "A scoreboard wait must not occur in the first two
@@ -991,7 +986,7 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
continue;
struct v3d_qpu_instr merged_inst;
if (!qpu_merge_inst(devinfo, &merged_inst,
if (!qpu_merge_inst(c->devinfo, &merged_inst,
&prev_inst->inst->qpu, inst)) {
continue;
}
@@ -1002,12 +997,13 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
*/
if (scoreboard->ldvary_pipelining && inst->sig.ldvary) {
assert(n->inst->is_ldvary_sequence);
scoreboard->ldvary_count++;
scoreboard->fixup_ldvary = true;
return n;
}
}
int prio = get_instruction_priority(devinfo, inst);
int prio = get_instruction_priority(c->devinfo, inst);
if (mux_read_stalls(scoreboard, inst)) {
/* Don't merge an instruction that stalls */
@@ -1045,39 +1041,49 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
}
}
/* If we are in the middle of an ldvary sequence we only pick up
* instructions that can continue the sequence so we can pipeline
* them, however, if we failed to find anything to schedule then we
* can't possibly continue the sequence and we need to stop the
* pipelining process and try again.
*
* There is one exception to the above: noperspective or flat
* varyings can cause us to not be able to pick an instruction
* because they need a nop between the ldvary and the next instruction
* to account for the ldvary r5 write latency. We can try to detect this
* by checking if we are also unable to schedule an instruction after
* disabling pipelining.
*
* FIXME: dropping pipelining and picking up another instruction could
* break the sequence for flat/noperspective varyings we could've been
* able to continue if we returned NULL here and scheduled a NOP as a
* result, but detecting this case would require us to know in advance
* that emitting the next NOP will guarantee that we will be able to
* continue the sequence.
*/
if (scoreboard->ldvary_pipelining && !prev_inst && !chosen) {
scoreboard->ldvary_pipelining = false;
chosen = choose_instruction_to_schedule(devinfo, scoreboard,
prev_inst);
scoreboard->ldvary_pipelining = !chosen;
} else if (chosen) {
if (scoreboard->ldvary_pipelining) {
assert(chosen->inst->is_ldvary_sequence);
/* Update ldvary pipelining state */
if (chosen) {
if (chosen->inst->qpu.sig.ldvary &&
chosen->inst->is_ldvary_sequence) {
scoreboard->ldvary_pipelining =
!chosen->inst->ldvary_pipelining_end;
} else if (chosen->inst->ldvary_pipelining_start) {
assert(chosen->inst->qpu.sig.ldvary);
scoreboard->ldvary_pipelining = true;
scoreboard->ldvary_pipelining = c->num_inputs > ++scoreboard->ldvary_count;
}
} else if (scoreboard->ldvary_pipelining) {
/* If we are in the middle of an ldvary sequence we only pick
* up instructions that can continue the sequence so we can
* pipeline them, however, if we failed to find anything to
* schedule (!prev_inst) then we can't possibly continue the
* sequence and we need to stop the pipelining process and try
* again.
*
* There is one exception to the above: noperspective or flat
* varyings can cause us to not be able to pick an instruction
* because they need a nop between the ldvary and the next
* instruction to account for the ldvary r5 write latency. We
* can try to detect this by checking if we are also unable to
* schedule an instruction after disabling pipelining.
*
* FIXME: dropping pipelining and picking up another instruction
* could break the sequence for flat/noperspective varyings we
* could've been able to continue if we returned NULL here and
* scheduled a NOP as a result, but detecting this case would
* require us to know in advance that emitting the next NOP will
* guarantee that we will be able to continue the sequence.
*
* If we failed to pair up (prev_inst != NULL), then we disable
* pipelining if we have already scheduled the last ldvary. This
* may allow any other instruction that is not part of an ldvary
* sequence to be merged into the last instruction of the last
* ldvary sequence for optimal results.
*/
if (!prev_inst) {
scoreboard->ldvary_pipelining = false;
chosen = choose_instruction_to_schedule(c, scoreboard,
prev_inst);
scoreboard->ldvary_pipelining = !chosen;
} else {
scoreboard->ldvary_pipelining =
c->num_inputs > scoreboard->ldvary_count;
}
}
@@ -1667,9 +1673,7 @@ schedule_instructions(struct v3d_compile *c,
while (!list_is_empty(&scoreboard->dag->heads)) {
struct schedule_node *chosen =
choose_instruction_to_schedule(devinfo,
scoreboard,
NULL);
choose_instruction_to_schedule(c, scoreboard, NULL);
struct schedule_node *merge = NULL;
/* If there are no valid instructions to schedule, drop a NOP
@@ -1702,8 +1706,7 @@ schedule_instructions(struct v3d_compile *c,
pre_remove_head(scoreboard->dag, chosen);
while ((merge =
choose_instruction_to_schedule(devinfo,
scoreboard,
choose_instruction_to_schedule(c, scoreboard,
chosen))) {
time = MAX2(merge->unblocked_time, time);
pre_remove_head(scoreboard->dag, merge);

View file

@@ -165,14 +165,6 @@ struct qinst {
/* Set if this instruction participates in a varying setup. */
bool is_ldvary_sequence;
/* Set if this is the ldvary instruction starting a sequence of
* varyings we want to pipeline.
*/
bool ldvary_pipelining_start;
/* Set if this is the last instruction involved with a pipelineable
* varying sequence.
*/
bool ldvary_pipelining_end;
};
enum quniform_contents {
@@ -780,11 +772,6 @@ struct v3d_compile {
uint32_t program_id;
uint32_t variant_id;
/* Used to track pipelinable sequences of varyings */
struct qinst *ldvary_sequence_start_inst;
struct qinst *ldvary_sequence_end_inst;
uint32_t ldvary_sequence_length;
/* Set to compile program in in 1x, 2x, or 4x threaded mode, where
* SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
* limiting ourselves to the part of the physical reg space.