From 75ede9d9bcda7328e2a98ac9fbf74260caec982f Mon Sep 17 00:00:00 2001
From: Dylan Baker
Date: Tue, 16 Jan 2024 11:20:29 -0800
Subject: [PATCH] intel/brw: track last successful pass and leave the loop
 early

This is similar to what RADV implements using the NIR_LOOP_PASS
helpers. I have not used those helpers for a couple of reasons:

1. They use the pointer to the optimization function, which doesn't
   work if the same function is called multiple times in one invocation
   of the loop (fixable)
2. After fixing them, due to Intel's use of sub-expressions, the amount
   of code added to wrap the shared macro becomes more than simply
   reimplementing them for the Intel compiler

On most workloads the results are a wash, but on compile heavy
workloads like Cyberpunk 2077 and Rise of the Tomb Raider, I saw
fossil-db runtimes fall by 1-2% on my ICL, with no changes to the
compiled shaders. Caio saw closer to 2.5% on TGL.

Reviewed-by: Caio Oliveira
Part-of:
---
 src/intel/compiler/brw_nir.c | 104 +++++++++++++++++++++--------------
 1 file changed, 64 insertions(+), 40 deletions(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index cb67f538ebd..77f19ec0642 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -688,6 +688,29 @@ brw_nir_lower_fs_outputs(nir_shader *nir)
       this_progress;                                  \
    })
 
+#define LOOP_OPT(pass, ...) ({                        \
+   const unsigned long this_line = __LINE__;          \
+   bool this_progress = false;                        \
+   if (opt_line == this_line)                         \
+      break;                                          \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+   if (this_progress) {                               \
+      progress = true;                                \
+      opt_line = this_line;                           \
+   }                                                  \
+   this_progress;                                     \
+})
+
+#define LOOP_OPT_NOT_IDEMPOTENT(pass, ...) ({         \
+   bool this_progress = false;                        \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+   if (this_progress) {                               \
+      progress = true;                                \
+      opt_line = 0;                                   \
+   }                                                  \
+   this_progress;                                     \
+})
+
 void
 brw_nir_optimize(nir_shader *nir,
                  const struct intel_device_info *devinfo)
@@ -698,6 +721,7 @@ brw_nir_optimize(nir_shader *nir,
       (nir->options->lower_flrp32 ? 32 : 0) |
       (nir->options->lower_flrp64 ? 64 : 0);
 
+   unsigned long opt_line = 0;
    do {
       progress = false;
       /* This pass is causing problems with types used by OpenCL :
@@ -707,36 +731,36 @@ brw_nir_optimize(nir_shader *nir,
        * code.
        */
       if (nir->info.stage != MESA_SHADER_KERNEL)
-         OPT(nir_split_array_vars, nir_var_function_temp);
-      OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
-      OPT(nir_opt_deref);
-      if (OPT(nir_opt_memcpy))
-         OPT(nir_split_var_copies);
-      OPT(nir_lower_vars_to_ssa);
+         LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
+      LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
+      LOOP_OPT(nir_opt_deref);
+      if (LOOP_OPT(nir_opt_memcpy))
+         LOOP_OPT(nir_split_var_copies);
+      LOOP_OPT(nir_lower_vars_to_ssa);
       if (!nir->info.var_copies_lowered) {
          /* Only run this pass if nir_lower_var_copies was not called
          * yet. That would lower away any copy_deref instructions and we
          * don't want to introduce any more.
          */
-         OPT(nir_opt_find_array_copies);
+         LOOP_OPT(nir_opt_find_array_copies);
       }
-      OPT(nir_opt_copy_prop_vars);
-      OPT(nir_opt_dead_write_vars);
-      OPT(nir_opt_combine_stores, nir_var_all);
+      LOOP_OPT(nir_opt_copy_prop_vars);
+      LOOP_OPT(nir_opt_dead_write_vars);
+      LOOP_OPT(nir_opt_combine_stores, nir_var_all);
 
-      OPT(nir_opt_ray_queries);
-      OPT(nir_opt_ray_query_ranges);
+      LOOP_OPT(nir_opt_ray_queries);
+      LOOP_OPT(nir_opt_ray_query_ranges);
 
-      OPT(nir_lower_alu_to_scalar, NULL, NULL);
+      LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
-      OPT(nir_copy_prop);
+      LOOP_OPT(nir_copy_prop);
 
-      OPT(nir_lower_phis_to_scalar, false);
+      LOOP_OPT(nir_lower_phis_to_scalar, false);
 
-      OPT(nir_copy_prop);
-      OPT(nir_opt_dce);
-      OPT(nir_opt_cse);
-      OPT(nir_opt_combine_stores, nir_var_all);
+      LOOP_OPT(nir_copy_prop);
+      LOOP_OPT(nir_opt_dce);
+      LOOP_OPT(nir_opt_cse);
+      LOOP_OPT(nir_opt_combine_stores, nir_var_all);
 
       /* Passing 0 to the peephole select pass causes it to convert
       * if-statements that contain only move instructions in the branches
@@ -753,23 +777,23 @@ brw_nir_optimize(nir_shader *nir,
       * indices will nearly always be in bounds and the cost of the load is
       * low. Therefore there shouldn't be a performance benefit to avoid it.
       */
-      OPT(nir_opt_peephole_select, 0, true, false);
-      OPT(nir_opt_peephole_select, 8, true, true);
+      LOOP_OPT(nir_opt_peephole_select, 0, true, false);
+      LOOP_OPT(nir_opt_peephole_select, 8, true, true);
 
-      OPT(nir_opt_intrinsics);
-      OPT(nir_opt_idiv_const, 32);
-      OPT(nir_opt_algebraic);
+      LOOP_OPT(nir_opt_intrinsics);
+      LOOP_OPT(nir_opt_idiv_const, 32);
+      LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);
 
-      OPT(nir_opt_reassociate_bfi);
+      LOOP_OPT(nir_opt_reassociate_bfi);
 
-      OPT(nir_lower_constant_convert_alu_types);
-      OPT(nir_opt_constant_folding);
+      LOOP_OPT(nir_lower_constant_convert_alu_types);
+      LOOP_OPT(nir_opt_constant_folding);
 
       if (lower_flrp != 0) {
-         if (OPT(nir_lower_flrp,
+         if (LOOP_OPT(nir_lower_flrp,
                  lower_flrp,
                  false /* always_precise */)) {
-            OPT(nir_opt_constant_folding);
+            LOOP_OPT(nir_opt_constant_folding);
          }
 
          /* Nothing should rematerialize any flrps, so we only need to do this
@@ -778,24 +802,24 @@ brw_nir_optimize(nir_shader *nir,
          lower_flrp = 0;
       }
 
-      OPT(nir_opt_dead_cf);
-      if (OPT(nir_opt_loop)) {
+      LOOP_OPT(nir_opt_dead_cf);
+      if (LOOP_OPT(nir_opt_loop)) {
          /* If nir_opt_loop makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
-         OPT(nir_copy_prop);
-         OPT(nir_opt_dce);
+         LOOP_OPT(nir_copy_prop);
+         LOOP_OPT(nir_opt_dce);
       }
-      OPT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
-      OPT(nir_opt_conditional_discard);
+      LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
+      LOOP_OPT(nir_opt_conditional_discard);
       if (nir->options->max_unroll_iterations != 0) {
-         OPT(nir_opt_loop_unroll);
+         LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
       }
-      OPT(nir_opt_remove_phis);
-      OPT(nir_opt_gcm, false);
-      OPT(nir_opt_undef);
-      OPT(nir_lower_pack);
+      LOOP_OPT(nir_opt_remove_phis);
+      LOOP_OPT(nir_opt_gcm, false);
+      LOOP_OPT(nir_opt_undef);
+      LOOP_OPT(nir_lower_pack);
    } while (progress);
 
    /* Workaround Gfxbench unused local sampler variable which will trigger an