From 75ede9d9bcda7328e2a98ac9fbf74260caec982f Mon Sep 17 00:00:00 2001
From: Dylan Baker
Date: Tue, 16 Jan 2024 11:20:29 -0800
Subject: [PATCH] intel/brw: track last successful pass and leave the loop
 early

This is similar to what RADV implements using the NIR_LOOP_PASS
helpers. I have not used those helpers for a couple of reasons:

1. They use the pointer to the optimization function, which doesn't
   work if the same function is called multiple times in one invocation
   of the loop (fixable)
2. After fixing them, due to Intel's use of sub-expressions, the amount
   of code added to wrap the shared macro becomes more than simply
   reimplementing them for the Intel compiler

On most workloads the results are a wash, but on compile heavy
workloads like Cyberpunk 2077 and Rise of the Tomb Raider, I saw
fossil-db runtimes fall by 1-2% on my ICL, with no changes to the
compiled shaders. Caio saw closer to 2.5% on TGL.

Reviewed-by: Caio Oliveira
Part-of:
---
 src/intel/compiler/brw_nir.c | 104 +++++++++++++++++++++--------------
 1 file changed, 64 insertions(+), 40 deletions(-)

diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index cb67f538ebd..77f19ec0642 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -688,6 +688,29 @@ brw_nir_lower_fs_outputs(nir_shader *nir)
       this_progress;                                  \
    })
 
+#define LOOP_OPT(pass, ...) ({                        \
+   const unsigned long this_line = __LINE__;          \
+   bool this_progress = false;                        \
+   if (opt_line == this_line)                         \
+      break;                                          \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+   if (this_progress) {                               \
+      progress = true;                                \
+      opt_line = this_line;                           \
+   }                                                  \
+   this_progress;                                     \
+})
+
+#define LOOP_OPT_NOT_IDEMPOTENT(pass, ...) ({         \
+   bool this_progress = false;                        \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+   if (this_progress) {                               \
+      progress = true;                                \
+      opt_line = 0;                                   \
+   }                                                  \
+   this_progress;                                     \
+})
+
 void
 brw_nir_optimize(nir_shader *nir,
                  const struct intel_device_info *devinfo)
@@ -698,6 +721,7 @@ brw_nir_optimize(nir_shader *nir,
       (nir->options->lower_flrp32 ? 32 : 0) |
       (nir->options->lower_flrp64 ? 64 : 0);
 
+   unsigned long opt_line = 0;
    do {
       progress = false;
       /* This pass is causing problems with types used by OpenCL :
@@ -707,36 +731,36 @@ brw_nir_optimize(nir_shader *nir,
        * code.
        */
       if (nir->info.stage != MESA_SHADER_KERNEL)
-         OPT(nir_split_array_vars, nir_var_function_temp);
-      OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
-      OPT(nir_opt_deref);
-      if (OPT(nir_opt_memcpy))
-         OPT(nir_split_var_copies);
-      OPT(nir_lower_vars_to_ssa);
+         LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
+      LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
+      LOOP_OPT(nir_opt_deref);
+      if (LOOP_OPT(nir_opt_memcpy))
+         LOOP_OPT(nir_split_var_copies);
+      LOOP_OPT(nir_lower_vars_to_ssa);
       if (!nir->info.var_copies_lowered) {
          /* Only run this pass if nir_lower_var_copies was not called
          * yet. That would lower away any copy_deref instructions and we
          * don't want to introduce any more.
          */
-         OPT(nir_opt_find_array_copies);
+         LOOP_OPT(nir_opt_find_array_copies);
       }
-      OPT(nir_opt_copy_prop_vars);
-      OPT(nir_opt_dead_write_vars);
-      OPT(nir_opt_combine_stores, nir_var_all);
+      LOOP_OPT(nir_opt_copy_prop_vars);
+      LOOP_OPT(nir_opt_dead_write_vars);
+      LOOP_OPT(nir_opt_combine_stores, nir_var_all);
 
-      OPT(nir_opt_ray_queries);
-      OPT(nir_opt_ray_query_ranges);
+      LOOP_OPT(nir_opt_ray_queries);
+      LOOP_OPT(nir_opt_ray_query_ranges);
 
-      OPT(nir_lower_alu_to_scalar, NULL, NULL);
+      LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
-      OPT(nir_copy_prop);
+      LOOP_OPT(nir_copy_prop);
 
-      OPT(nir_lower_phis_to_scalar, false);
+      LOOP_OPT(nir_lower_phis_to_scalar, false);
 
-      OPT(nir_copy_prop);
-      OPT(nir_opt_dce);
-      OPT(nir_opt_cse);
-      OPT(nir_opt_combine_stores, nir_var_all);
+      LOOP_OPT(nir_copy_prop);
+      LOOP_OPT(nir_opt_dce);
+      LOOP_OPT(nir_opt_cse);
+      LOOP_OPT(nir_opt_combine_stores, nir_var_all);
 
       /* Passing 0 to the peephole select pass causes it to convert
       * if-statements that contain only move instructions in the branches
@@ -753,23 +777,23 @@ brw_nir_optimize(nir_shader *nir,
       * indices will nearly always be in bounds and the cost of the load is
       * low. Therefore there shouldn't be a performance benefit to avoid it.
       */
-      OPT(nir_opt_peephole_select, 0, true, false);
-      OPT(nir_opt_peephole_select, 8, true, true);
+      LOOP_OPT(nir_opt_peephole_select, 0, true, false);
+      LOOP_OPT(nir_opt_peephole_select, 8, true, true);
 
-      OPT(nir_opt_intrinsics);
-      OPT(nir_opt_idiv_const, 32);
-      OPT(nir_opt_algebraic);
+      LOOP_OPT(nir_opt_intrinsics);
+      LOOP_OPT(nir_opt_idiv_const, 32);
+      LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);
 
-      OPT(nir_opt_reassociate_bfi);
+      LOOP_OPT(nir_opt_reassociate_bfi);
 
-      OPT(nir_lower_constant_convert_alu_types);
-      OPT(nir_opt_constant_folding);
+      LOOP_OPT(nir_lower_constant_convert_alu_types);
+      LOOP_OPT(nir_opt_constant_folding);
 
       if (lower_flrp != 0) {
-         if (OPT(nir_lower_flrp,
+         if (LOOP_OPT(nir_lower_flrp,
                  lower_flrp,
                  false /* always_precise */)) {
-            OPT(nir_opt_constant_folding);
+            LOOP_OPT(nir_opt_constant_folding);
          }
 
          /* Nothing should rematerialize any flrps, so we only need to do this
@@ -778,24 +802,24 @@ brw_nir_optimize(nir_shader *nir,
          lower_flrp = 0;
       }
 
-      OPT(nir_opt_dead_cf);
-      if (OPT(nir_opt_loop)) {
+      LOOP_OPT(nir_opt_dead_cf);
+      if (LOOP_OPT(nir_opt_loop)) {
          /* If nir_opt_loop makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
-         OPT(nir_copy_prop);
-         OPT(nir_opt_dce);
+         LOOP_OPT(nir_copy_prop);
+         LOOP_OPT(nir_opt_dce);
       }
-      OPT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
-      OPT(nir_opt_conditional_discard);
+      LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
+      LOOP_OPT(nir_opt_conditional_discard);
       if (nir->options->max_unroll_iterations != 0) {
-         OPT(nir_opt_loop_unroll);
+         LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
       }
-      OPT(nir_opt_remove_phis);
-      OPT(nir_opt_gcm, false);
-      OPT(nir_opt_undef);
-      OPT(nir_lower_pack);
+      LOOP_OPT(nir_opt_remove_phis);
+      LOOP_OPT(nir_opt_gcm, false);
+      LOOP_OPT(nir_opt_undef);
+      LOOP_OPT(nir_lower_pack);
    } while (progress);
 
    /* Workaround Gfxbench unused local sampler variable which will trigger an