nir/opt_varyings: fix mesh shader miss promote varying to flat

We still allow mesh shaders to promote constant outputs to flat, but
a mesh shader, like a geometry shader, may store multiple vertices'
varyings in a single thread. So a mesh shader may store different
constant values to different vertices within a single thread; we
should not promote this case to flat.

I'm not using shader_info.mesh.ms_cross_invocation_output_access
because OpenGL does not require IO to have an explicit location, so
when nir_shader_gather_info is called in the OpenGL GLSL compiler to
compute ms_cross_invocation_output_access, some implicit outputs have
a -1 location, which leaves ms_cross_invocation_output_access unset
for them.

Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13134
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35081>
(cherry picked from commit 6f2a1e19da)
This commit is contained in:
Qiang Yu 2025-05-16 09:55:01 +08:00 committed by Eric Engestrom
parent 042736a4d4
commit e07cea0be5
2 changed files with 41 additions and 22 deletions

View file

@ -1664,7 +1664,7 @@
"description": "nir/opt_varyings: fix mesh shader miss promote varying to flat",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -43,7 +43,7 @@
*
* When an output stores an SSA that is convergent and all stores of that
* output appear in unconditional blocks or conditional blocks with
* a convergent entry condition and the shader is not GS, it implies that all
* a convergent entry condition and the shader is not GS or MS, it implies that all
* vertices of that output have the same value, therefore the output can be
* promoted to flat because all interpolation modes lead to the same result
* as flat. Such outputs are opportunistically compacted with both flat and
@ -692,9 +692,9 @@ struct linkage_info {
BITSET_DECLARE(xfb32_only_mask, NUM_SCALAR_SLOTS);
BITSET_DECLARE(xfb16_only_mask, NUM_SCALAR_SLOTS);
/* Mask of all TCS inputs using cross-invocation access. */
BITSET_DECLARE(tcs_cross_invoc32_mask, NUM_SCALAR_SLOTS);
BITSET_DECLARE(tcs_cross_invoc16_mask, NUM_SCALAR_SLOTS);
/* Mask of all TCS inputs or MS outputs using cross-invocation access. */
BITSET_DECLARE(cross_invoc32_mask, NUM_SCALAR_SLOTS);
BITSET_DECLARE(cross_invoc16_mask, NUM_SCALAR_SLOTS);
/* Mask of all TCS->TES slots that are read by TCS, but not TES. */
BITSET_DECLARE(no_varying32_mask, NUM_SCALAR_SLOTS);
@ -794,8 +794,8 @@ print_linkage(struct linkage_info *linkage)
!BITSET_TEST(linkage->indirect_mask, i) &&
!BITSET_TEST(linkage->xfb32_only_mask, i) &&
!BITSET_TEST(linkage->xfb16_only_mask, i) &&
!BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) &&
!BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) &&
!BITSET_TEST(linkage->cross_invoc32_mask, i) &&
!BITSET_TEST(linkage->cross_invoc16_mask, i) &&
!BITSET_TEST(linkage->no_varying32_mask, i) &&
!BITSET_TEST(linkage->no_varying16_mask, i) &&
!BITSET_TEST(linkage->interp_fp32_mask, i) &&
@ -827,8 +827,8 @@ print_linkage(struct linkage_info *linkage)
BITSET_TEST(linkage->indirect_mask, i) ? " indirect" : "",
BITSET_TEST(linkage->xfb32_only_mask, i) ? " xfb32_only" : "",
BITSET_TEST(linkage->xfb16_only_mask, i) ? " xfb16_only" : "",
BITSET_TEST(linkage->tcs_cross_invoc32_mask, i) ? " tcs_cross_invoc32" : "",
BITSET_TEST(linkage->tcs_cross_invoc16_mask, i) ? " tcs_cross_invoc16" : "",
BITSET_TEST(linkage->cross_invoc32_mask, i) ? " cross_invoc32" : "",
BITSET_TEST(linkage->cross_invoc16_mask, i) ? " cross_invoc16" : "",
BITSET_TEST(linkage->no_varying32_mask, i) ? " no_varying32" : "",
BITSET_TEST(linkage->no_varying16_mask, i) ? " no_varying16" : "",
BITSET_TEST(linkage->interp_fp32_mask, i) ? " interp_fp32" : "",
@ -887,8 +887,8 @@ slot_disable_optimizations_and_compaction(struct linkage_info *linkage,
BITSET_CLEAR(linkage->interp_explicit_strict16_mask, i);
BITSET_CLEAR(linkage->per_primitive32_mask, i);
BITSET_CLEAR(linkage->per_primitive16_mask, i);
BITSET_CLEAR(linkage->tcs_cross_invoc32_mask, i);
BITSET_CLEAR(linkage->tcs_cross_invoc16_mask, i);
BITSET_CLEAR(linkage->cross_invoc32_mask, i);
BITSET_CLEAR(linkage->cross_invoc16_mask, i);
BITSET_CLEAR(linkage->no_varying32_mask, i);
BITSET_CLEAR(linkage->no_varying16_mask, i);
BITSET_CLEAR(linkage->color32_mask, i);
@ -1468,9 +1468,9 @@ gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_d
if (!is_sysval(vertex_index_instr, SYSTEM_VALUE_INVOCATION_ID)) {
if (intr->def.bit_size == 32)
BITSET_SET(linkage->tcs_cross_invoc32_mask, slot);
BITSET_SET(linkage->cross_invoc32_mask, slot);
else if (intr->def.bit_size == 16)
BITSET_SET(linkage->tcs_cross_invoc16_mask, slot);
BITSET_SET(linkage->cross_invoc16_mask, slot);
else
unreachable("invalid load_input type");
}
@ -1642,6 +1642,21 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
unreachable("invalid store_output type");
}
}
if (linkage->producer_stage == MESA_SHADER_MESH &&
intr->intrinsic == nir_intrinsic_store_per_vertex_output) {
nir_src *vertex_index_src = nir_get_io_arrayed_index_src(intr);
nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
if (!is_sysval(vertex_index_instr, SYSTEM_VALUE_INVOCATION_ID)) {
if (value->bit_size == 32)
BITSET_SET(linkage->cross_invoc32_mask, slot);
else if (value->bit_size == 16)
BITSET_SET(linkage->cross_invoc16_mask, slot);
else
unreachable("invalid store_output type");
}
}
} else {
/* Only TCS output loads can get here.
*
@ -1745,7 +1760,7 @@ tidy_up_convergent_varyings(struct linkage_info *linkage)
* bit and keep the convergent bit, which means that it's interpolated,
* but can be promoted to flat.
*
* Since the geometry shader is the only shader that can store values
* Since the geometry shader and mesh shader can store values
* in multiple vertices before FS, it's required that all stores are
* equal to be considered convergent (output_equal_mask), otherwise
* the promotion to flat would be incorrect.
@ -1760,7 +1775,9 @@ tidy_up_convergent_varyings(struct linkage_info *linkage)
BITSET_CLEAR(linkage->convergent32_mask, i);
} else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
BITSET_TEST(linkage->flat32_mask, i)) ||
(linkage->producer_stage == MESA_SHADER_GEOMETRY &&
((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
(linkage->producer_stage == MESA_SHADER_MESH &&
BITSET_TEST(linkage->cross_invoc32_mask, i))) &&
!BITSET_TEST(linkage->output_equal_mask, i))) {
/* Keep the original qualifier. */
BITSET_CLEAR(linkage->convergent32_mask, i);
@ -1784,7 +1801,9 @@ tidy_up_convergent_varyings(struct linkage_info *linkage)
BITSET_CLEAR(linkage->convergent16_mask, i);
} else if ((!linkage->can_mix_convergent_flat_with_interpolated &&
BITSET_TEST(linkage->flat16_mask, i)) ||
(linkage->producer_stage == MESA_SHADER_GEOMETRY &&
((linkage->producer_stage == MESA_SHADER_GEOMETRY ||
(linkage->producer_stage == MESA_SHADER_MESH &&
BITSET_TEST(linkage->cross_invoc16_mask, i))) &&
!BITSET_TEST(linkage->output_equal_mask, i))) {
/* Keep the original qualifier. */
BITSET_CLEAR(linkage->convergent16_mask, i);
@ -4964,18 +4983,18 @@ compact_varyings(struct linkage_info *linkage,
: VARYING_SLOT_VAR0) * 8;
if (linkage->consumer_stage == MESA_SHADER_TESS_CTRL) {
/* Make tcs_cross_invoc*_mask bits disjoint with flat*_mask bits
* because tcs_cross_invoc*_mask is initially a subset of flat*_mask,
/* Make cross_invoc*_mask bits disjoint with flat*_mask bits
* because cross_invoc*_mask is initially a subset of flat*_mask,
* but we must assign each scalar slot only once.
*/
BITSET_ANDNOT(linkage->flat32_mask, linkage->flat32_mask,
linkage->tcs_cross_invoc32_mask);
linkage->cross_invoc32_mask);
BITSET_ANDNOT(linkage->flat16_mask, linkage->flat16_mask,
linkage->tcs_cross_invoc16_mask);
linkage->cross_invoc16_mask);
/* Put cross-invocation-accessed TCS inputs first. */
vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->tcs_cross_invoc32_mask,
linkage->tcs_cross_invoc16_mask,
vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->cross_invoc32_mask,
linkage->cross_invoc16_mask,
&slot_index, NULL, progress);
/* Remaining TCS inputs. */
vs_tcs_tes_gs_assign_slots_2sets(linkage, linkage->flat32_mask,