Merge branch 'intel-nir-passes' into 'main'

Draft: brw: do less stuff in brw_nir_optimize

See merge request mesa/mesa!38413
Alyssa Rosenzweig 2025-12-19 19:47:14 -05:00
commit bc9c06012a


@@ -1377,27 +1377,6 @@ brw_nir_optimize(nir_shader *nir,
       if (nir->info.stage != MESA_SHADER_KERNEL)
          LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
       LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
-      LOOP_OPT(nir_opt_deref);
-      if (LOOP_OPT(nir_opt_memcpy))
-         LOOP_OPT(nir_split_var_copies);
-      LOOP_OPT(nir_lower_vars_to_ssa);
-      if (!nir->info.var_copies_lowered) {
-         /* Only run this pass if nir_lower_var_copies was not called
-          * yet. That would lower away any copy_deref instructions and we
-          * don't want to introduce any more.
-          */
-         LOOP_OPT(nir_opt_find_array_copies);
-      }
-      LOOP_OPT(nir_opt_copy_prop_vars);
-      LOOP_OPT(nir_opt_dead_write_vars);
-      LOOP_OPT(nir_opt_combine_stores, nir_var_all);
-      LOOP_OPT(nir_opt_ray_queries);
-      LOOP_OPT(nir_opt_ray_query_ranges);
-      LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);
-      LOOP_OPT(nir_opt_copy_prop);
       LOOP_OPT(nir_lower_phis_to_scalar, NULL, NULL);
@@ -1406,39 +1385,21 @@ brw_nir_optimize(nir_shader *nir,
       LOOP_OPT(nir_opt_cse);
       LOOP_OPT(nir_opt_combine_stores, nir_var_all);
-      /* Passing 0 to the peephole select pass causes it to convert
-       * if-statements that contain only move instructions in the branches
-       * regardless of the count.
-       *
-       * Passing 1 to the peephole select pass causes it to convert
-       * if-statements that contain at most a single ALU instruction (total)
-       * in both branches. Before Gfx6, some math instructions were
-       * prohibitively expensive and the results of compare operations need an
-       * extra resolve step. For these reasons, this pass is more harmful
-       * than good on those platforms.
-       *
-       * For indirect loads of uniforms (push constants), we assume that array
+      /* For indirect loads of uniforms (push constants), we assume that array
        * indices will nearly always be in bounds and the cost of the load is
        * low. Therefore there shouldn't be a performance benefit to avoid it.
        */
       nir_opt_peephole_select_options peephole_select_options = {
-         .limit = 0,
+         .limit = 8,
          .indirect_load_ok = true,
+         .expensive_alu_ok = true,
+         .discard_ok = true,
       };
       LOOP_OPT(nir_opt_peephole_select, &peephole_select_options);
-      peephole_select_options.limit = 8;
-      peephole_select_options.expensive_alu_ok = true;
-      LOOP_OPT(nir_opt_peephole_select, &peephole_select_options);
       LOOP_OPT(nir_opt_intrinsics);
-      LOOP_OPT(nir_opt_idiv_const, 32);
       LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);
-      LOOP_OPT(nir_opt_generate_bfi);
-      LOOP_OPT(nir_opt_reassociate_bfi);
-      LOOP_OPT(nir_lower_constant_convert_alu_types);
       LOOP_OPT(nir_opt_constant_folding);
       LOOP_OPT(nir_opt_dead_cf);
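
For context on the peephole-select change above: nir_opt_peephole_select flattens small if/else blocks into select (bcsel) instructions, with .limit bounding how many ALU instructions the branches may contain and .expensive_alu_ok/.discard_ok widening what may be speculated. A rough C analogy of the rewrite (hypothetical illustration, not Mesa code):

    float branchy(int cond, float a, float b)
    {
       float r;
       if (cond)               /* small branch body, under the .limit threshold */
          r = a * b + 1.0f;
       else
          r = b;
       return r;
    }

    float flattened(int cond, float a, float b)
    {
       /* both sides are evaluated; the branch becomes one select */
       return cond ? a * b + 1.0f : b;
    }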
@@ -1452,24 +1413,13 @@ brw_nir_optimize(nir_shader *nir,
       }
       LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
-      nir_opt_peephole_select_options peephole_discard_options = {
-         .limit = 0,
-         .discard_ok = true,
-      };
-      LOOP_OPT(nir_opt_peephole_select, &peephole_discard_options);
       if (nir->options->max_unroll_iterations != 0) {
          LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
       }
       LOOP_OPT(nir_opt_remove_phis);
       LOOP_OPT(nir_opt_gcm, false);
       LOOP_OPT(nir_opt_undef);
-      LOOP_OPT(nir_lower_pack);
    } while (progress);
-
-   /* Workaround Gfxbench unused local sampler variable which will trigger an
-    * assert in the opt_large_constants pass.
-    */
-   OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
 }
 
 static unsigned
@@ -1681,12 +1631,34 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    OPT(nir_normalize_cubemap_coords);
    OPT(nir_lower_global_vars_to_local);
+   OPT(nir_lower_pack);
+   OPT(nir_lower_constant_convert_alu_types);
    OPT(nir_split_var_copies);
    OPT(nir_split_struct_vars, nir_var_function_temp);
+   if (OPT(nir_opt_memcpy))
+      OPT(nir_split_var_copies);
+   OPT(nir_lower_vars_to_ssa);
+
+   /* Run this pass before nir_lower_var_copies: it introduces copy_derefs. */
+   OPT(nir_opt_find_array_copies);
 
    brw_nir_optimize(nir, devinfo);
 
+   if (nir->info.ray_queries) {
+      OPT(nir_opt_ray_queries);
+      OPT(nir_opt_ray_query_ranges);
+   }
+
+   OPT(nir_opt_deref);
+   OPT(nir_opt_copy_prop_vars);
+   OPT(nir_opt_dead_write_vars);
+   OPT(nir_lower_vars_to_ssa);
+   OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
+
    unsigned lower_flrp =
       (nir->options->lower_flrp16 ? 16 : 0) |
       (nir->options->lower_flrp32 ? 32 : 0) |
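
The hunk above also moves nir_opt_find_array_copies ahead of nir_lower_var_copies, since the pass introduces copy_deref instructions and must run while they are still legal. What it detects is an element-wise array copy that can be treated as one whole-array copy; a hypothetical C sketch of the idea:

    #include <string.h>

    void elementwise(float dst[16], const float src[16])
    {
       for (int i = 0; i < 16; i++)
          dst[i] = src[i];                    /* 16 separate load/store pairs */
    }

    void as_one_copy(float dst[16], const float src[16])
    {
       memcpy(dst, src, 16 * sizeof(float));  /* recognized as one array copy */
    }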
@@ -1740,7 +1712,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    nir_variable_mode indirect_mask =
       brw_nir_no_indirect_mask(compiler, nir->info.stage);
-   OPT(nir_lower_indirect_derefs_to_if_else_trees, indirect_mask, UINT32_MAX);
+   if (OPT(nir_lower_indirect_derefs_to_if_else_trees, indirect_mask, UINT32_MAX))
+      OPT(nir_lower_vars_to_ssa);
 
    /* Even in cases where we can handle indirect temporaries via scratch, we
     * it can still be expensive. Lower indirects on small arrays to
@@ -1755,8 +1728,10 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    * issues are helped but nothing else in shader-db is hurt except for maybe
    * that one kerbal space program shader.
    */
-   if (!(indirect_mask & nir_var_function_temp))
-      OPT(nir_lower_indirect_derefs_to_if_else_trees, nir_var_function_temp, 16);
+   if (!(indirect_mask & nir_var_function_temp)) {
+      if (OPT(nir_lower_indirect_derefs_to_if_else_trees, nir_var_function_temp, 16))
+         OPT(nir_lower_vars_to_ssa);
+   }
 
    /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
    * SSBOs, our back-end is capable of loading an entire vec4 at a time and
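
For reference, nir_lower_indirect_derefs_to_if_else_trees (used in the two hunks above) replaces a dynamically indexed access to a small array with a binary tree of compares and direct accesses, which the newly added nir_lower_vars_to_ssa call can then clean up. A hypothetical C sketch for a 4-element array:

    float indirect(const float arr[4], int i)
    {
       return arr[i];                         /* dynamic index */
    }

    float if_else_tree(const float arr[4], int i)
    {
       /* every leaf is a direct, constant-index access */
       if (i < 2)
          return (i == 0) ? arr[0] : arr[1];
       else
          return (i == 2) ? arr[2] : arr[3];
    }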
@@ -1765,9 +1740,12 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    * optimizer to combine UBO and SSBO load operations and save us some send
    * messages.
    */
-   OPT(nir_lower_array_deref_of_vec,
-       nir_var_mem_ubo | nir_var_mem_ssbo, NULL,
-       nir_lower_direct_array_deref_of_vec_load);
+   if (OPT(nir_lower_array_deref_of_vec,
+           nir_var_mem_ubo | nir_var_mem_ssbo, NULL,
+           nir_lower_direct_array_deref_of_vec_load)) {
+      OPT(nir_opt_copy_prop_vars);
+   }
 
    /* Clamp load_per_vertex_input of the TCS stage so that we do not generate
    * loads reading out of bounds. We can do this here because we called
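
Returning to the nir_lower_array_deref_of_vec change above: lowering a direct component load into a whole-vector load plus a component pick gives later passes whole-vec4 loads that they can merge into fewer send messages, and the added nir_opt_copy_prop_vars call cleans up afterwards. Roughly, in C terms (hypothetical illustration):

    #include <string.h>

    float component_only(const float *ubo_vec4)
    {
       return ubo_vec4[2];            /* narrow load of one 32-bit component */
    }

    float whole_vector(const float *ubo_vec4)
    {
       float v[4];
       memcpy(v, ubo_vec4, sizeof v); /* load the entire vec4 at once ... */
       return v[2];                   /* ... then pick component 2 */
    }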
@@ -1934,14 +1912,18 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
    NIR_PASS(_, producer, nir_lower_io_vars_to_scalar, nir_var_shader_out);
    NIR_PASS(_, consumer, nir_lower_io_vars_to_scalar, nir_var_shader_in);
 
+   NIR_PASS(_, producer, nir_opt_copy_prop_vars);
+   NIR_PASS(_, consumer, nir_opt_copy_prop_vars);
+
    brw_nir_optimize(producer, devinfo);
    brw_nir_optimize(consumer, devinfo);
 
    if (nir_link_opt_varyings(producer, consumer))
       brw_nir_optimize(consumer, devinfo);
 
-   NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
-   NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
+   NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out |
+            nir_var_function_temp, NULL);
+   NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in |
+            nir_var_function_temp, NULL);
 
    if (nir_remove_unused_varyings(producer, consumer)) {
       if (should_print_nir(producer)) {
@@ -1955,6 +1937,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
       NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
 
+      NIR_PASS(_, producer, nir_opt_copy_prop_vars);
+      NIR_PASS(_, consumer, nir_opt_copy_prop_vars);
+
       brw_nir_optimize(producer, devinfo);
       brw_nir_optimize(consumer, devinfo);
@@ -1992,6 +1976,7 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
       NIR_PASS(_, producer, nir_split_var_copies);
       NIR_PASS(_, producer, nir_lower_var_copies);
+      NIR_PASS(_, producer, nir_lower_vars_to_ssa);
    }
 
    if (producer->info.stage == MESA_SHADER_TASK &&
@@ -2258,7 +2243,7 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
                                const struct brw_compiler *compiler,
                                enum brw_robustness_flags robust_flags)
 {
-   bool progress = false;
+   UNUSED bool progress = false;
 
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
@@ -2321,17 +2306,11 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
       .cb_data = &cb_data,
    };
    OPT(nir_lower_mem_access_bit_sizes, &mem_access_options);
-   while (progress) {
-      progress = false;
-      OPT(nir_lower_pack);
-      OPT(nir_opt_copy_prop);
-      OPT(nir_opt_dce);
-      OPT(nir_opt_cse);
-      OPT(nir_opt_algebraic);
-      OPT(nir_opt_constant_folding);
-   }
+   OPT(nir_lower_pack);
+   OPT(nir_opt_copy_prop);
+   OPT(nir_opt_dce);
+   OPT(nir_opt_algebraic);
+   OPT(nir_opt_cse);
 
    /* Do this after the vectorization & brw_nir_rebase_const_offset_ubo_loads
    * so that we maximize the offset put into the messages.
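
For reference, the vectorizer that this cleanup follows merges adjacent narrow memory accesses into fewer, wider ones, which is also why maximizing the constant offsets folded into the messages matters. Roughly, in C terms (hypothetical illustration):

    #include <string.h>

    float two_loads(const float *ssbo)
    {
       return ssbo[0] + ssbo[1];      /* two 32-bit loads, two messages */
    }

    float one_load(const float *ssbo)
    {
       float v[2];
       memcpy(v, ssbo, sizeof v);     /* one 64-bit (vec2) load */
       return v[0] + v[1];
    }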
@@ -2565,6 +2544,7 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
    OPT(brw_nir_lower_texture);
    OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
+   OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
    OPT(nir_opt_combine_barriers, combine_all_memory_barriers, NULL);
@@ -2573,9 +2553,10 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
       OPT(nir_opt_algebraic_before_ffma);
    } while (progress);
 
+   OPT(nir_opt_idiv_const, 32);
+
    if (devinfo->verx10 >= 125) {
       /* Lower integer division by constants before nir_lower_idiv. */
-      OPT(nir_opt_idiv_const, 32);
       const nir_lower_idiv_options options = {
          .allow_fp16 = false
       };
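
nir_opt_idiv_const, now run once here rather than inside the optimization loop, strength-reduces division by a constant into a multiply-high and shift so the generic nir_lower_idiv path never sees it. The classic trick, sketched in C (hypothetical illustration):

    #include <stdint.h>

    int32_t div5(int32_t x)
    {
       /* x / 5 via multiply-high: 0x66666667 is roughly 2^33 / 5 */
       int32_t q = (int32_t)(((int64_t)x * 0x66666667LL) >> 33);
       return q - (x >> 31);   /* fixup so negative x truncates toward zero */
    }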
@@ -2641,6 +2622,8 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
    OPT(nir_opt_shrink_vectors, false);
    OPT(intel_nir_opt_peephole_imul32x16);
+   OPT(nir_opt_generate_bfi);
+   OPT(nir_opt_reassociate_bfi);
 
    if (OPT(nir_opt_comparison_pre)) {
       OPT(nir_opt_copy_prop);
@@ -2653,33 +2636,22 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
        * might be under the threshold of conversion to bcsel.
        */
       nir_opt_peephole_select_options peephole_select_options = {
-         .limit = 0,
+         .limit = 1,
+         .expensive_alu_ok = true,
       };
       OPT(nir_opt_peephole_select, &peephole_select_options);
-      peephole_select_options.limit = 1;
-      peephole_select_options.expensive_alu_ok = true;
-      OPT(nir_opt_peephole_select, &peephole_select_options);
    }
 
-   do {
-      progress = false;
-      OPT(brw_nir_opt_fsat);
-      OPT(nir_opt_algebraic_late);
-      OPT(brw_nir_lower_fsign);
-
-      if (progress) {
-         OPT(nir_opt_constant_folding);
-         OPT(nir_opt_copy_prop);
-         OPT(nir_opt_dce);
-         OPT(nir_opt_cse);
-      }
-   } while (progress);
+   OPT(brw_nir_lower_fsign);
+   OPT(brw_nir_opt_fsat);
+
+   while (OPT(nir_opt_algebraic_late)) {
+      OPT(nir_opt_copy_prop);
+      OPT(nir_opt_dce);
+      OPT(nir_opt_cse);
+   }
 
    OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
    OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
    while (OPT(nir_opt_algebraic_distribute_src_mods)) {
@@ -2816,10 +2788,7 @@ brw_postprocess_nir_out_of_ssa(nir_shader *nir,
    }
 
    OPT(nir_convert_from_ssa, true, true);
 
-   OPT(nir_opt_rematerialize_compares);
-   OPT(nir_opt_dce);
+   if (OPT(nir_opt_rematerialize_compares))
+      OPT(nir_opt_dce);
 
    nir_trivialize_registers(nir);