diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index df2f050138e..f14fcdbfa02 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -1377,27 +1377,6 @@ brw_nir_optimize(nir_shader *nir,
       if (nir->info.stage != MESA_SHADER_KERNEL)
          LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
       LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
-      LOOP_OPT(nir_opt_deref);
-      if (LOOP_OPT(nir_opt_memcpy))
-         LOOP_OPT(nir_split_var_copies);
-      LOOP_OPT(nir_lower_vars_to_ssa);
-      if (!nir->info.var_copies_lowered) {
-         /* Only run this pass if nir_lower_var_copies was not called
-          * yet. That would lower away any copy_deref instructions and we
-          * don't want to introduce any more.
-          */
-         LOOP_OPT(nir_opt_find_array_copies);
-      }
-      LOOP_OPT(nir_opt_copy_prop_vars);
-      LOOP_OPT(nir_opt_dead_write_vars);
-      LOOP_OPT(nir_opt_combine_stores, nir_var_all);
-
-      LOOP_OPT(nir_opt_ray_queries);
-      LOOP_OPT(nir_opt_ray_query_ranges);
-
-      LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);
-
-      LOOP_OPT(nir_opt_copy_prop);
 
       LOOP_OPT(nir_lower_phis_to_scalar, NULL, NULL);
 
@@ -1406,39 +1385,21 @@ brw_nir_optimize(nir_shader *nir,
       LOOP_OPT(nir_opt_cse);
       LOOP_OPT(nir_opt_combine_stores, nir_var_all);
 
-      /* Passing 0 to the peephole select pass causes it to convert
-       * if-statements that contain only move instructions in the branches
-       * regardless of the count.
-       *
-       * Passing 1 to the peephole select pass causes it to convert
-       * if-statements that contain at most a single ALU instruction (total)
-       * in both branches. Before Gfx6, some math instructions were
-       * prohibitively expensive and the results of compare operations need an
-       * extra resolve step. For these reasons, this pass is more harmful
-       * than good on those platforms.
-       *
-       * For indirect loads of uniforms (push constants), we assume that array
+      /* For indirect loads of uniforms (push constants), we assume that array
        * indices will nearly always be in bounds and the cost of the load is
-       * low.  Therefore there shouldn't be a performance benefit to avoid it.
+       * low. Therefore there shouldn't be a performance benefit to avoid it.
        */
       nir_opt_peephole_select_options peephole_select_options = {
-         .limit = 0,
+         .limit = 8,
          .indirect_load_ok = true,
+         .expensive_alu_ok = true,
+         .discard_ok = true,
       };
       LOOP_OPT(nir_opt_peephole_select, &peephole_select_options);
 
-      peephole_select_options.limit = 8;
-      peephole_select_options.expensive_alu_ok = true;
-      LOOP_OPT(nir_opt_peephole_select, &peephole_select_options);
-
       LOOP_OPT(nir_opt_intrinsics);
-      LOOP_OPT(nir_opt_idiv_const, 32);
       LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);
-      LOOP_OPT(nir_opt_generate_bfi);
-      LOOP_OPT(nir_opt_reassociate_bfi);
-
-      LOOP_OPT(nir_lower_constant_convert_alu_types);
       LOOP_OPT(nir_opt_constant_folding);
 
       LOOP_OPT(nir_opt_dead_cf);
@@ -1452,24 +1413,13 @@ brw_nir_optimize(nir_shader *nir,
       }
       LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
-      nir_opt_peephole_select_options peephole_discard_options = {
-         .limit = 0,
-         .discard_ok = true,
-      };
-      LOOP_OPT(nir_opt_peephole_select, &peephole_discard_options);
 
       if (nir->options->max_unroll_iterations != 0) {
          LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
       }
       LOOP_OPT(nir_opt_remove_phis);
       LOOP_OPT(nir_opt_gcm, false);
       LOOP_OPT(nir_opt_undef);
-      LOOP_OPT(nir_lower_pack);
    } while (progress);
-
-   /* Workaround Gfxbench unused local sampler variable which will trigger an
-    * assert in the opt_large_constants pass.
-    */
-   OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
 }
 
 static unsigned
@@ -1681,12 +1631,34 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    OPT(nir_normalize_cubemap_coords);
 
    OPT(nir_lower_global_vars_to_local);
 
+   OPT(nir_lower_pack);
+   OPT(nir_lower_constant_convert_alu_types);
    OPT(nir_split_var_copies);
    OPT(nir_split_struct_vars, nir_var_function_temp);
 
+   if (OPT(nir_opt_memcpy))
+      OPT(nir_split_var_copies);
+
+   OPT(nir_lower_vars_to_ssa);
+
+   /* Run this pass before nir_lower_var_copies: it introduces copy_derefs. */
+   OPT(nir_opt_find_array_copies);
+
    brw_nir_optimize(nir, devinfo);
 
+   if (nir->info.ray_queries) {
+      OPT(nir_opt_ray_queries);
+      OPT(nir_opt_ray_query_ranges);
+   }
+
+   OPT(nir_opt_deref);
+   OPT(nir_opt_copy_prop_vars);
+   OPT(nir_opt_dead_write_vars);
+
+   OPT(nir_lower_vars_to_ssa);
+   OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
+
    unsigned lower_flrp =
       (nir->options->lower_flrp16 ? 16 : 0) |
       (nir->options->lower_flrp32 ? 32 : 0) |
@@ -1740,7 +1712,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
 
    nir_variable_mode indirect_mask =
       brw_nir_no_indirect_mask(compiler, nir->info.stage);
-   OPT(nir_lower_indirect_derefs_to_if_else_trees, indirect_mask, UINT32_MAX);
+   if (OPT(nir_lower_indirect_derefs_to_if_else_trees, indirect_mask, UINT32_MAX))
+      OPT(nir_lower_vars_to_ssa);
 
    /* Even in cases where we can handle indirect temporaries via scratch, we
    * it can still be expensive. Lower indirects on small arrays to
@@ -1755,8 +1728,10 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    * issues are helped but nothing else in shader-db is hurt except for maybe
    * that one kerbal space program shader.
    */
-   if (!(indirect_mask & nir_var_function_temp))
-      OPT(nir_lower_indirect_derefs_to_if_else_trees, nir_var_function_temp, 16);
+   if (!(indirect_mask & nir_var_function_temp)) {
+      if (OPT(nir_lower_indirect_derefs_to_if_else_trees, nir_var_function_temp, 16))
+         OPT(nir_lower_vars_to_ssa);
+   }
 
    /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
    * SSBOs, our back-end is capable of loading an entire vec4 at a time and
@@ -1765,9 +1740,12 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    * optimizer to combine UBO and SSBO load operations and save us some send
    * messages.
    */
-   OPT(nir_lower_array_deref_of_vec,
+   if (OPT(nir_lower_array_deref_of_vec,
        nir_var_mem_ubo | nir_var_mem_ssbo, NULL,
-       nir_lower_direct_array_deref_of_vec_load);
+       nir_lower_direct_array_deref_of_vec_load)) {
+
+      OPT(nir_opt_copy_prop_vars);
+   }
 
    /* Clamp load_per_vertex_input of the TCS stage so that we do not generate
    * loads reading out of bounds. We can do this here because we called
@@ -1934,14 +1912,18 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
    NIR_PASS(_, producer, nir_lower_io_vars_to_scalar, nir_var_shader_out);
    NIR_PASS(_, consumer, nir_lower_io_vars_to_scalar, nir_var_shader_in);
 
+   NIR_PASS(_, producer, nir_opt_copy_prop_vars);
+   NIR_PASS(_, consumer, nir_opt_copy_prop_vars);
    brw_nir_optimize(producer, devinfo);
    brw_nir_optimize(consumer, devinfo);
 
    if (nir_link_opt_varyings(producer, consumer))
      brw_nir_optimize(consumer, devinfo);
 
-   NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
-   NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
+   NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out |
+            nir_var_function_temp, NULL);
+   NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in |
+            nir_var_function_temp, NULL);
 
    if (nir_remove_unused_varyings(producer, consumer)) {
       if (should_print_nir(producer)) {
@@ -1955,6 +1937,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
 
       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
       NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
 
+      NIR_PASS(_, producer, nir_opt_copy_prop_vars);
+      NIR_PASS(_, consumer, nir_opt_copy_prop_vars);
       brw_nir_optimize(producer, devinfo);
       brw_nir_optimize(consumer, devinfo);
@@ -1992,6 +1976,7 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
       NIR_PASS(_, producer, nir_split_var_copies);
       NIR_PASS(_, producer, nir_lower_var_copies);
+      NIR_PASS(_, producer, nir_lower_vars_to_ssa);
    }
 
    if (producer->info.stage == MESA_SHADER_TASK &&
@@ -2258,7 +2243,7 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
                                const struct brw_compiler *compiler,
                                enum brw_robustness_flags robust_flags)
 {
-   bool progress = false;
+   UNUSED bool progress = false;
 
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
@@ -2321,17 +2306,11 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
       .cb_data = &cb_data,
    };
    OPT(nir_lower_mem_access_bit_sizes, &mem_access_options);
-
-   while (progress) {
-      progress = false;
-
-      OPT(nir_lower_pack);
-      OPT(nir_opt_copy_prop);
-      OPT(nir_opt_dce);
-      OPT(nir_opt_cse);
-      OPT(nir_opt_algebraic);
-      OPT(nir_opt_constant_folding);
-   }
+   OPT(nir_lower_pack);
+   OPT(nir_opt_copy_prop);
+   OPT(nir_opt_dce);
+   OPT(nir_opt_algebraic);
+   OPT(nir_opt_cse);
 
    /* Do this after the vectorization & brw_nir_rebase_const_offset_ubo_loads
    * so that we maximize the offset put into the messages.
@@ -2565,6 +2544,7 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
 
    OPT(brw_nir_lower_texture);
    OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
+   OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
    OPT(nir_opt_combine_barriers, combine_all_memory_barriers, NULL);
 
@@ -2573,9 +2553,10 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
       OPT(nir_opt_algebraic_before_ffma);
    } while (progress);
 
+   OPT(nir_opt_idiv_const, 32);
+
    if (devinfo->verx10 >= 125) {
       /* Lower integer division by constants before nir_lower_idiv. */
-      OPT(nir_opt_idiv_const, 32);
       const nir_lower_idiv_options options = {
          .allow_fp16 = false
       };
@@ -2641,6 +2622,8 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
 
    OPT(nir_opt_shrink_vectors, false);
    OPT(intel_nir_opt_peephole_imul32x16);
+   OPT(nir_opt_generate_bfi);
+   OPT(nir_opt_reassociate_bfi);
 
    if (OPT(nir_opt_comparison_pre)) {
       OPT(nir_opt_copy_prop);
@@ -2653,33 +2636,22 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
       * might be under the threshold of conversion to bcsel.
       */
      nir_opt_peephole_select_options peephole_select_options = {
-         .limit = 0,
+         .limit = 1,
+         .expensive_alu_ok = true,
      };
      OPT(nir_opt_peephole_select, &peephole_select_options);
-
-      peephole_select_options.limit = 1;
-      peephole_select_options.expensive_alu_ok = true;
-      OPT(nir_opt_peephole_select, &peephole_select_options);
   }
 
-   do {
-      progress = false;
-
-      OPT(brw_nir_opt_fsat);
-      OPT(nir_opt_algebraic_late);
-      OPT(brw_nir_lower_fsign);
-
-      if (progress) {
-         OPT(nir_opt_constant_folding);
-         OPT(nir_opt_copy_prop);
-         OPT(nir_opt_dce);
-         OPT(nir_opt_cse);
-      }
-   } while (progress);
+   OPT(brw_nir_lower_fsign);
+   OPT(brw_nir_opt_fsat);
+   while (OPT(nir_opt_algebraic_late)) {
+      OPT(nir_opt_copy_prop);
+      OPT(nir_opt_dce);
+      OPT(nir_opt_cse);
+   }
 
    OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
-   OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
    while (OPT(nir_opt_algebraic_distribute_src_mods)) {
@@ -2816,12 +2788,9 @@ brw_postprocess_nir_out_of_ssa(nir_shader *nir,
    }
 
    OPT(nir_convert_from_ssa, true, true);
-
+   OPT(nir_opt_rematerialize_compares);
    OPT(nir_opt_dce);
 
-   if (OPT(nir_opt_rematerialize_compares))
-      OPT(nir_opt_dce);
-
    nir_trivialize_registers(nir);
 
    nir_sweep(nir);
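Note on the OPT()/LOOP_OPT() helpers used throughout these hunks: their definitions live earlier in brw_nir.c and are not part of this patch. The sketch below is an assumed illustration of the idiom they implement, not the file's actual macro bodies: each invocation wraps NIR_PASS on the function-local `nir` shader and folds the per-pass result into the function-local `progress` flag, which is what the surrounding do { ... } while (progress) loops test to rerun the pass list until a fixed point.

/* Illustrative sketch only -- not the real macro from brw_nir.c.
 * Assumes a function-local `nir_shader *nir` and `bool progress`,
 * mirroring how brw_nir_optimize() and friends are structured.
 */
#define OPT(pass, ...) ({                                   \
   bool _this_progress = false;                             \
   NIR_PASS(_this_progress, nir, pass, ##__VA_ARGS__);      \
   if (_this_progress)                                      \
      progress = true;                                      \
   _this_progress;                                          \
})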
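The peephole-select rework in brw_nir_optimize() is easier to read outside diff form: what used to be two back-to-back nir_opt_peephole_select calls (limit 0, then limit 8 with expensive_alu_ok) plus a separate discard-only variant later in the loop collapses into one configuration. The snippet below is reassembled from the hunks above as a sketch of how the resulting loop body reads after the patch; treat it as illustrative rather than a quote of the final file.

/* Single consolidated peephole-select invocation inside the
 * do { ... } while (progress) loop of brw_nir_optimize().
 */
nir_opt_peephole_select_options peephole_select_options = {
   .limit = 8,               /* flatten ifs containing up to 8 instructions */
   .indirect_load_ok = true, /* indirect push-constant loads are cheap enough */
   .expensive_alu_ok = true,
   .discard_ok = true,       /* covers the removed peephole_discard_options block */
};
LOOP_OPT(nir_opt_peephole_select, &peephole_select_options);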