intel/brw: track last successful pass and leave the loop early

This is similar to what RADV implements using the NIR_LOOP_PASS helpers. I have not used those helpers for a couple of reasons: 1. They use the pointer to the optimization function, which doesn't work if the same function is called multiple times in one invocation of the loop (fixable) 2. After fixing them, due to Intel's use of sub-expressions, the amount of code added to wrap the shared macro becomes more than simply reimplementing them for the Intel compiler On most workloads the results are a wash, but on compile heavy workloads like Cyberpunk 2077 and Rise of the Tomb Raider, I saw fossil-db runtimes fall by 1-2% on my ICL, with no changes to the compiled shaders. Caio saw closer to 2.5% on TGL. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27510>
2026-01-07 15:10:12 +01:00 · 2024-01-16 11:20:29 -08:00 · 2024-01-16 11:20:29 -08:00 · 75ede9d9bc
commit 75ede9d9bc
parent b2ee98d2db
1 changed files with 64 additions and 40 deletions
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@ -688,6 +688,29 @@ brw_nir_lower_fs_outputs(nir_shader *nir)
   this_progress;                                          \
 })

+#define LOOP_OPT(pass, ...) ({                             \
+   const unsigned long this_line = __LINE__;               \
+   bool this_progress = false;                             \
+   if (opt_line == this_line)                              \
+      break;                                               \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   if (this_progress) {                                    \
+      progress = true;                                     \
+      opt_line = this_line;                                \
+   }                                                       \
+   this_progress;                                          \
+})
+
+#define LOOP_OPT_NOT_IDEMPOTENT(pass, ...) ({              \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   if (this_progress) {                                    \
+      progress = true;                                     \
+      opt_line = 0;                                        \
+   }                                                       \
+   this_progress;                                          \
+})
+
 void
 brw_nir_optimize(nir_shader *nir,
                 const struct intel_device_info *devinfo)
@ -698,6 +721,7 @@ brw_nir_optimize(nir_shader *nir,
      (nir->options->lower_flrp32 ? 32 : 0) |
      (nir->options->lower_flrp64 ? 64 : 0);

+   unsigned long opt_line = 0;
   do {
      progress = false;
      /* This pass is causing problems with types used by OpenCL :
@ -707,36 +731,36 @@ brw_nir_optimize(nir_shader *nir,
       * code.
       */
      if (nir->info.stage != MESA_SHADER_KERNEL)
-         OPT(nir_split_array_vars, nir_var_function_temp);
-      OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
-      OPT(nir_opt_deref);
-      if (OPT(nir_opt_memcpy))
-         OPT(nir_split_var_copies);
-      OPT(nir_lower_vars_to_ssa);
+         LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
+      LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
+      LOOP_OPT(nir_opt_deref);
+      if (LOOP_OPT(nir_opt_memcpy))
+         LOOP_OPT(nir_split_var_copies);
+      LOOP_OPT(nir_lower_vars_to_ssa);
      if (!nir->info.var_copies_lowered) {
         /* Only run this pass if nir_lower_var_copies was not called
          * yet. That would lower away any copy_deref instructions and we
          * don't want to introduce any more.
          */
-         OPT(nir_opt_find_array_copies);
+         LOOP_OPT(nir_opt_find_array_copies);
      }
-      OPT(nir_opt_copy_prop_vars);
-      OPT(nir_opt_dead_write_vars);
-      OPT(nir_opt_combine_stores, nir_var_all);
+      LOOP_OPT(nir_opt_copy_prop_vars);
+      LOOP_OPT(nir_opt_dead_write_vars);
+      LOOP_OPT(nir_opt_combine_stores, nir_var_all);

-      OPT(nir_opt_ray_queries);
-      OPT(nir_opt_ray_query_ranges);
+      LOOP_OPT(nir_opt_ray_queries);
+      LOOP_OPT(nir_opt_ray_query_ranges);

-      OPT(nir_lower_alu_to_scalar, NULL, NULL);
+      LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);

-      OPT(nir_copy_prop);
+      LOOP_OPT(nir_copy_prop);

-      OPT(nir_lower_phis_to_scalar, false);
+      LOOP_OPT(nir_lower_phis_to_scalar, false);

-      OPT(nir_copy_prop);
-      OPT(nir_opt_dce);
-      OPT(nir_opt_cse);
-      OPT(nir_opt_combine_stores, nir_var_all);
+      LOOP_OPT(nir_copy_prop);
+      LOOP_OPT(nir_opt_dce);
+      LOOP_OPT(nir_opt_cse);
+      LOOP_OPT(nir_opt_combine_stores, nir_var_all);

      /* Passing 0 to the peephole select pass causes it to convert
       * if-statements that contain only move instructions in the branches
@ -753,23 +777,23 @@ brw_nir_optimize(nir_shader *nir,
       * indices will nearly always be in bounds and the cost of the load is
       * low.  Therefore there shouldn't be a performance benefit to avoid it.
       */
-      OPT(nir_opt_peephole_select, 0, true, false);
-      OPT(nir_opt_peephole_select, 8, true, true);
+      LOOP_OPT(nir_opt_peephole_select, 0, true, false);
+      LOOP_OPT(nir_opt_peephole_select, 8, true, true);

-      OPT(nir_opt_intrinsics);
-      OPT(nir_opt_idiv_const, 32);
-      OPT(nir_opt_algebraic);
+      LOOP_OPT(nir_opt_intrinsics);
+      LOOP_OPT(nir_opt_idiv_const, 32);
+      LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);

-      OPT(nir_opt_reassociate_bfi);
+      LOOP_OPT(nir_opt_reassociate_bfi);

-      OPT(nir_lower_constant_convert_alu_types);
-      OPT(nir_opt_constant_folding);
+      LOOP_OPT(nir_lower_constant_convert_alu_types);
+      LOOP_OPT(nir_opt_constant_folding);

      if (lower_flrp != 0) {
-         if (OPT(nir_lower_flrp,
+         if (LOOP_OPT(nir_lower_flrp,
                 lower_flrp,
                 false /* always_precise */)) {
-            OPT(nir_opt_constant_folding);
+            LOOP_OPT(nir_opt_constant_folding);
         }

         /* Nothing should rematerialize any flrps, so we only need to do this
@ -778,24 +802,24 @@ brw_nir_optimize(nir_shader *nir,
         lower_flrp = 0;
      }

-      OPT(nir_opt_dead_cf);
-      if (OPT(nir_opt_loop)) {
+      LOOP_OPT(nir_opt_dead_cf);
+      if (LOOP_OPT(nir_opt_loop)) {
         /* If nir_opt_loop makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
-         OPT(nir_copy_prop);
-         OPT(nir_opt_dce);
+         LOOP_OPT(nir_copy_prop);
+         LOOP_OPT(nir_opt_dce);
      }
-      OPT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
-      OPT(nir_opt_conditional_discard);
+      LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
+      LOOP_OPT(nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations != 0) {
-         OPT(nir_opt_loop_unroll);
+         LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
      }
-      OPT(nir_opt_remove_phis);
-      OPT(nir_opt_gcm, false);
-      OPT(nir_opt_undef);
-      OPT(nir_lower_pack);
+      LOOP_OPT(nir_opt_remove_phis);
+      LOOP_OPT(nir_opt_gcm, false);
+      LOOP_OPT(nir_opt_undef);
+      LOOP_OPT(nir_lower_pack);
   } while (progress);

   /* Workaround Gfxbench unused local sampler variable which will trigger an