nir/opt_varyings: back propagate signed zero information to outputs

Foz-DB Navi48:
Totals from 809 (0.67% of 120695) affected shaders:
MaxWaves: 21804 -> 21808 (+0.02%)
Instrs: 863131 -> 861310 (-0.21%); split: -0.22%, +0.01%
CodeSize: 4535500 -> 4523232 (-0.27%); split: -0.30%, +0.03%
VGPRs: 47304 -> 47280 (-0.05%)
SpillSGPRs: 170 -> 82 (-51.76%)
Latency: 6791484 -> 6786880 (-0.07%); split: -0.07%, +0.00%
InvThroughput: 906281 -> 905301 (-0.11%); split: -0.11%, +0.00%
VClause: 16910 -> 16917 (+0.04%); split: -0.01%, +0.05%
SClause: 21856 -> 21827 (-0.13%); split: -0.14%, +0.01%
Copies: 61890 -> 61436 (-0.73%); split: -0.80%, +0.06%
Branches: 19725 -> 19640 (-0.43%)
PreSGPRs: 38011 -> 37851 (-0.42%)
PreVGPRs: 36482 -> 36454 (-0.08%)
VALU: 465316 -> 464323 (-0.21%); split: -0.22%, +0.00%
SALU: 143757 -> 143395 (-0.25%); split: -0.33%, +0.08%
VMEM: 36827 -> 36806 (-0.06%)
SMEM: 37769 -> 37768 (-0.00%)

Reviewed-by: Marek Olšák <maraeo@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41292>
This commit is contained in:
Georg Lehmann 2026-04-27 09:14:00 +02:00 committed by Marge Bot
parent b2bc57551a
commit fac4edbcba
8 changed files with 91 additions and 10 deletions

View file

@ -162,7 +162,13 @@
* * Eliminated output stores get the "no_varying" flag if they are also
* xfb stores or write sysval outputs.
*
* 5. Backward inter-shader code motion
* 5. Link signed zero information
*
* If no loads care about the sign of zero, and if there is no xfb or
* sign-aware sysval usage, set the no_signed_zero flag for output stores.
* This can then be further back propagated by `nir_opt_fp_math_ctrl`.
*
* 6. Backward inter-shader code motion
*
* "Backward" refers to moving code in the opposite direction that shaders
* are executed, i.e. moving code from the consumer to the producer.
@ -329,7 +335,7 @@
* the above case (which is temp0 and temp1 to replace all 3 inputs), let
* us know.
*
* 6. Forward inter-shader code motion
* 7. Forward inter-shader code motion
*
* TODO: Not implemented. The text below is a draft of the description.
*
@ -365,7 +371,7 @@
* we don't increase the GPU overhead measurably by moving code across
* pipeline stages that amplify GPU work.
*
* 7. Compaction to vec4 slots (AKA packing)
* 8. Compaction to vec4 slots (AKA packing)
*
* First, varyings are divided into these groups, and components from each
* group are assigned locations in this order (effectively forcing
@ -764,6 +770,11 @@ struct linkage_info {
*/
BITSET_DECLARE(convergent32_mask, NUM_SCALAR_SLOTS);
BITSET_DECLARE(convergent16_mask, NUM_SCALAR_SLOTS);
/* Mask of components that have an input load, xfb, or sysval usage that
* cares about the sign of zero.
*/
BITSET_DECLARE(signed_zero_mask, NUM_SCALAR_SLOTS);
};
/******************************************************************
@ -1060,6 +1071,32 @@ is_active_sysval_output(struct linkage_info *linkage, unsigned slot,
!nir_intrinsic_io_semantics(intr).no_sysval_output;
}
/* Return whether this store writes an active sysval output whose consumer
 * may distinguish -0.0 from +0.0, i.e. whether the sign of zero must be
 * preserved for it. (NOTE(review): position/clip/cull/point-size sysvals are
 * treated as sign-of-zero insensitive — presumably because fixed-function
 * consumption doesn't observe the sign; confirm against the pass docs.)
 */
static bool
is_sz_sysval(struct linkage_info *linkage, unsigned slot,
             nir_intrinsic_instr *intr)
{
   /* Stores that aren't live sysval outputs never constrain signed zeros. */
   if (!is_active_sysval_output(linkage, slot, intr))
      return false;

   const unsigned vslot = vec4_slot(slot);

   /* Sign-of-zero insensitive sysvals. */
   if (vslot == VARYING_SLOT_POS ||
       vslot == VARYING_SLOT_CLIP_VERTEX ||
       vslot == VARYING_SLOT_PSIZ ||
       vslot == VARYING_SLOT_CLIP_DIST0 ||
       vslot == VARYING_SLOT_CLIP_DIST1 ||
       vslot == VARYING_SLOT_CULL_DIST0 ||
       vslot == VARYING_SLOT_CULL_DIST1)
      return false;

   if (vslot == VARYING_SLOT_TESS_LEVEL_OUTER ||
       vslot == VARYING_SLOT_TESS_LEVEL_INNER ||
       vslot == VARYING_SLOT_BOUNDING_BOX0 ||
       vslot == VARYING_SLOT_BOUNDING_BOX1) {
      /* These enums are aliased with integer mesh outputs. */
      return linkage->producer_stage != MESA_SHADER_TESS_CTRL;
   }

   /* Everything else is assumed to care about the sign of zero. */
   return true;
}
/**
* This function acts like a filter. The pass won't touch varyings that
* return false here, and the return value is saved in the linkage bitmasks,
@ -1276,6 +1313,12 @@ gather_inputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_d
list_addtail(&node->head, &in->consumer.loads);
in->num_slots = MAX2(in->num_slots, sem.num_slots);
if (!sem.no_signed_zero && intr->intrinsic != nir_intrinsic_load_interpolated_input) {
unsigned nsz_count = nir_src_is_const(offset) ? 1 : sem.num_slots;
for (unsigned i = 0; i < nsz_count; i++)
BITSET_SET(linkage->signed_zero_mask, slot + i * 8);
}
BITSET_SET(linkage->removable_mask, slot);
enum fs_vec4_type fs_vec4_type = FS_VEC4_TYPE_NONE;
@ -1555,6 +1598,12 @@ gather_outputs(struct nir_builder *builder, nir_intrinsic_instr *intr, void *cb_
list_addtail(&node->head, &out->producer.loads);
}
if (is_store ? (is_sz_sysval(linkage, slot, intr) || has_xfb(intr)) : !sem.no_signed_zero) {
unsigned nsz_count = nir_src_is_const(offset) ? 1 : sem.num_slots;
for (unsigned i = 0; i < nsz_count; i++)
BITSET_SET(linkage->signed_zero_mask, slot + i * 8);
}
BITSET_SET(linkage->removable_mask, slot);
/* Indirect indexing. */
@ -4181,6 +4230,35 @@ backward_inter_shader_code_motion(struct linkage_info *linkage,
return false;
}
/******************************************************************
* SIGNED ZERO LINKING
******************************************************************/
/* For every output store in the producer, recompute the no_signed_zero flag
 * from the linked consumer's usage: the flag is set iff none of the scalar
 * slots the store can reach were marked in signed_zero_mask (i.e. no input
 * load, xfb, or sysval use cares about the sign of zero). Reports producer
 * progress whenever a store's semantics actually change.
 */
static void
link_no_signed_zero(struct linkage_info *linkage,
                    nir_opt_varyings_progress *progress)
{
   for (unsigned slot = 0; slot < NUM_SCALAR_SLOTS; slot++) {
      struct scalar_slot *sslot = &linkage->slot[slot];

      list_for_each_entry(struct list_node, iter, &sslot->producer.stores, head) {
         nir_intrinsic_instr *store = iter->instr;
         nir_io_semantics sem = nir_intrinsic_io_semantics(store);

         /* A store with a non-constant offset may land in any of its
          * sem.num_slots vec4 slots; a direct store only touches this one.
          */
         const bool is_direct =
            nir_src_is_const(*nir_get_io_offset_src(store));
         const unsigned num_checked = is_direct ? 1 : sem.num_slots;

         bool nsz = true;
         for (unsigned i = 0; i < num_checked && nsz; i++) {
            /* Scalar slots of consecutive vec4 slots are 8 apart. */
            if (BITSET_TEST(linkage->signed_zero_mask, slot + i * 8))
               nsz = false;
         }

         if (sem.no_signed_zero != nsz) {
            sem.no_signed_zero = nsz;
            nir_intrinsic_set_io_semantics(store, sem);
            *progress |= nir_progress_producer;
         }
      }
   }
}
/******************************************************************
* COMPACTION
******************************************************************/
@ -5396,6 +5474,9 @@ nir_opt_varyings(nir_shader *producer, nir_shader *consumer, bool spirv,
init_linkage(producer, consumer, spirv, max_uniform_components,
max_ubos_per_stage, linkage, &progress);
link_no_signed_zero(linkage, &progress);
/* This must be done after deduplication and before inter-shader code
* motion.
*/

View file

@ -82,7 +82,7 @@ TEST_F(nir_opt_varyings_test_bicm_binary_alu, \
ASSERT_TRUE(!shader_contains_def(b2, load[0])); \
ASSERT_TRUE(!shader_contains_def(b2, load[1])); \
} else { \
ASSERT_EQ(opt_varyings(), 0); \
ASSERT_EQ(opt_varyings() & nir_progress_consumer, 0); \
ASSERT_TRUE(!shader_contains_alu_op(b1, nir_op_##alu, bitsize)); \
ASSERT_TRUE(shader_contains_alu_op(b2, nir_op_##alu, bitsize)); \
ASSERT_TRUE(shader_contains_instr(b1, &store[0]->instr)); \

View file

@ -34,7 +34,7 @@ TEST_F(nir_opt_varyings_test_dead_output, \
store_output(b1, VARYING_SLOT_##slot, 0, nir_type_float##bitsize, \
nir_imm_floatN_t(b1, 0, bitsize), 0); \
\
ASSERT_TRUE(opt_varyings() == 0); \
opt_varyings(); \
ASSERT_TRUE(b1->shader->info.outputs_written == VARYING_BIT_##slot); \
ASSERT_TRUE(shader_contains_instr(b1, &intr->instr)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(intr).no_varying == \

View file

@ -62,7 +62,7 @@ TEST_F(nir_opt_varyings_test_dedup, \
ASSERT_TRUE(!shader_contains_def(b2, load[1][v])); \
} \
} else { \
ASSERT_EQ(opt_varyings(), 0); \
ASSERT_EQ(opt_varyings() & nir_progress_consumer, 0); \
for (unsigned v = 0; v < (is_per_vertex(b1, (gl_varying_slot)pslot[1], false) ? 3 : 1); v++) { \
ASSERT_TRUE(shader_contains_instr(b1, &store[0][v]->instr)); \
ASSERT_TRUE(shader_contains_instr(b1, &store[1][v]->instr)); \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_const, \
SHADER_CONST_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, value, value) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_ubo, \
SHADER_UBO_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_uniform, \
SHADER_UNIFORM_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \

View file

@ -59,7 +59,7 @@ TEST_F(nir_opt_varyings_test_prop_uniform_expr, \
SHADER_UNI_EXPR_OUTPUT(producer_stage, consumer_stage, slot, comp, type, bitsize, 1, 1) \
\
if (nir_slot_is_sysval_output((gl_varying_slot)pindex, MESA_SHADER_##consumer_stage)) { \
ASSERT_TRUE(opt_varyings() == nir_progress_consumer); \
ASSERT_TRUE(opt_varyings() & nir_progress_consumer); \
ASSERT_TRUE(b1->shader->info.outputs_written == BITFIELD64_BIT(pindex)); \
ASSERT_TRUE(nir_intrinsic_io_semantics(store).no_varying); \
} else { \