nir/opt_move_to_top: add an option to exclude moving at_offset/at_sample loads

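This splits the nir_move_to_top_input_loads option into two options: nir_move_to_top_input_loads_simple and nir_move_to_top_input_loads_complex_baryc. The latter covers interpolated loads with non-trivial barycentrics, mainly at_offset/at_sample loads. Most callers are then updated to use only the first option; ir3 enables both, which keeps its previous behavior. The rationale is that moving at_sample loads out of conditional blocks makes Control (game) shaders worse, as per the code comment in nir.h.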
2026-05-08 02:38:04 +02:00 · 2026-04-23 23:22:13 -04:00 · 2026-04-23 23:22:13 -04:00 · d108ed6888
commit d108ed6888
parent 0684976de8
6 changed files with 49 additions and 14 deletions
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -372,7 +372,7 @@ radv_postprocess_nir(const struct radv_compiler_info *compiler_info, const struc
         /* Always load all VS inputs at the top to eliminate needless VMEM->s_wait->VMEM sequences.
          * Each s_wait can cost 1000 cycles, so make sure all VS input loads are grouped.
          */
-         NIR_PASS(_, stage->nir, nir_opt_move_to_top, nir_move_to_top_input_loads);
+         NIR_PASS(_, stage->nir, nir_opt_move_to_top, nir_move_to_top_input_loads_simple);
         NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts);
         NIR_PASS(_, stage->nir, nir_opt_move, sink_opts);
      } else {
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5874,8 +5874,21 @@ typedef enum {
   nir_move_to_entry_block_only = BITFIELD_BIT(0),

   /* Instruction options. */
-   nir_move_to_top_input_loads = BITFIELD_BIT(1),
-   nir_move_to_top_load_smem_amd = BITFIELD_BIT(2),
+
+   /* Simple input loads are non-interpolated loads and interpolated loads
+    * with pixel, centroid, and sample barycentrics. Other barycentrics are
+    * excluded.
+    */
+   nir_move_to_top_input_loads_simple = BITFIELD_BIT(1),
+
+   /* Interpolated loads with non-trivial barycentrics, such as at_offset and
+    * at_sample. (this option is not recommended for Control (game) because
+    * it moves at_sample with complex ALU perspective-correct interpolation
+    * out of conditional blocks)
+    */
+   nir_move_to_top_input_loads_complex_baryc = BITFIELD_BIT(2),
+
+   nir_move_to_top_load_smem_amd = BITFIELD_BIT(3),
 } nir_opt_move_to_top_options;

 bool nir_opt_move_to_top(nir_shader *nir, nir_opt_move_to_top_options options);
--- a/src/compiler/nir/nir_opt_move_to_top.c
+++ b/src/compiler/nir/nir_opt_move_to_top.c
@@ -12,9 +12,9 @@
 * of instructions that are moved.
 *
 * Used either as a scheduling optimization or to accommodate hw or compiler
- * backend limitations. You would typically use this if you don't use
- * nir_lower_io_vars_to_temporaries and want to move input loads to top,
- * but note that such global code motion passes often increase register usage.
+ * backend limitations. It would typically be used if
+ * nir_lower_io_vars_to_temporaries isn't used and it's desirable to move input
+ * loads to top, but such global code motion often increases register usage.
 */

 #include "nir.h"
@@ -138,10 +138,29 @@ handle_load(nir_builder *b, nir_intrinsic_instr *intr, void *_state)
    * an input load. The specific intrinsics that are moved are
    * listed in can_move_src_to_top.
    */
-   move |= state->options & nir_move_to_top_input_loads &&
-           nir_intrinsic_has_io_semantics(intr) &&
-           nir_intrinsic_infos[intr->intrinsic].has_dest &&
-           !nir_is_output_load(intr);
+   if (state->options & (nir_move_to_top_input_loads_simple |
+                         nir_move_to_top_input_loads_complex_baryc) &&
+       nir_intrinsic_has_io_semantics(intr) &&
+       nir_intrinsic_infos[intr->intrinsic].has_dest &&
+       !nir_is_output_load(intr)) {
+
+      if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
+         nir_intrinsic_instr *baryc =
+            nir_def_as_intrinsic_or_null(intr->src[0].ssa);
+
+         nir_opt_move_to_top_options baryc_option =
+            baryc &&
+            (baryc->intrinsic == nir_intrinsic_load_barycentric_pixel ||
+             baryc->intrinsic == nir_intrinsic_load_barycentric_centroid ||
+             baryc->intrinsic == nir_intrinsic_load_barycentric_sample) ?
+                  nir_move_to_top_input_loads_simple :
+                  nir_move_to_top_input_loads_complex_baryc;
+
+         move |= !!(state->options & baryc_option);
+      } else {
+         move |= !!(state->options & nir_move_to_top_input_loads_simple);
+      }
+   }

   move |= state->options & nir_move_to_top_load_smem_amd &&
           (intr->intrinsic == nir_intrinsic_load_global_amd &&
--- a/src/compiler/nir/nir_opt_varyings.c
+++ b/src/compiler/nir/nir_opt_varyings.c
@@ -5527,7 +5527,7 @@ nir_opt_varyings_bulk(nir_shader **shaders, uint32_t num_shaders, bool spirv,
      if (nir->info.stage == MESA_SHADER_FRAGMENT) {
         NIR_PASS(_, nir, nir_opt_move_to_top,
                  nir_move_to_entry_block_only |
-                     nir_move_to_top_input_loads);
+                     nir_move_to_top_input_loads_simple);
      }

      /* nir_opt_varyings requires scalar IO. Scalarize all varyings (not just
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -728,8 +728,11 @@ ir3_finalize_nir(struct ir3_compiler *compiler,
    * more optimal at the top.
    */
   if (s->info.stage == MESA_SHADER_VERTEX ||
-       s->info.stage == MESA_SHADER_FRAGMENT)
-      NIR_PASS(_, s, nir_opt_move_to_top, nir_move_to_top_input_loads);
+       s->info.stage == MESA_SHADER_FRAGMENT) {
+      NIR_PASS(_, s, nir_opt_move_to_top,
+               nir_move_to_top_input_loads_simple |
+               nir_move_to_top_input_loads_complex_baryc);
+   }

   if (s->info.stage == MESA_SHADER_GEOMETRY) {
      /* nir_unlower_io_to_vars expects constant indirect offsets to be folded
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -884,7 +884,7 @@ static void si_preprocess_nir(struct si_nir_shader_ctx *ctx)
    */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_FRAGMENT)
-      NIR_PASS(progress, nir, nir_opt_move_to_top, nir_move_to_top_input_loads);
+      NIR_PASS(progress, nir, nir_opt_move_to_top, nir_move_to_top_input_loads_simple);

   /* Remove dead temps before we lower indirect indexing. */
   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_function_temp, NULL);