diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp
index c312c27a8a6..b244387fdac 100644
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@@ -1641,7 +1641,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
       }
    }
 
-   if (devinfo->ver >= 30) {
+   if (compiler->optimistic_simd_heuristic) {
       unsigned max_dispatch_width = reqd_dispatch_width ? reqd_dispatch_width : 32;
 
       if (max_polygons >= 2 && !key->coarse_pixel) {
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 821c9087265..9e9086beb66 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -112,6 +112,9 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
    compiler->lower_dpas = !devinfo->has_systolic ||
       debug_get_bool_option("INTEL_LOWER_DPAS", false);
 
+   compiler->optimistic_simd_heuristic =
+      debug_get_bool_option("INTEL_SIMD_OPTIMISTIC", false);
+
    nir_lower_int64_options int64_options =
       nir_lower_imul64 |
       nir_lower_isign64 |
@@ -244,6 +247,8 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
    bits++;
    insert_u64_bit(&config, compiler->lower_dpas);
    bits++;
+   insert_u64_bit(&config, compiler->optimistic_simd_heuristic);
+   bits++;
 
    enum intel_debug_flag debug_bits[] = {
       DEBUG_NO_DUAL_OBJECT_GS,
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index d5eb632a731..361c172f596 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -121,6 +121,26 @@ struct brw_compiler {
     */
    bool lower_dpas;
 
+   /**
+    * This can be set to use an "optimistic" SIMD heuristic that
+    * assumes that the highest SIMD width and polygon count achievable
+    * without spills will give the highest performance, so the
+    * compiler doesn't need to try more than that.
+    *
+    * As of xe3 most programs compile without spills at 32-wide
+    * dispatch so with this option enabled typically only a single
+    * back-end compilation will be done instead of the default
+    * behavior of one compilation per supported dispatch mode. This
+    * can speed up the back-end compilation of fragment shaders by a
+    * 2+ factor, but could also increase compile-time especially on
+    * pre-xe3 platforms in cases with high register pressure.
+    *
+    * Run-time performance of the shaders will be reduced since this
+    * removes the ability to use a static analysis to estimate the
+    * relative performance of the dispatch modes supported.
+    */
+   bool optimistic_simd_heuristic;
+
    /**
     * Calling the ra_allocate function after each register spill can take
     * several minutes. This option speeds up shader compilation by spilling
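
Note: the standalone sketch below only illustrates the idea described in the new brw_compiler.h comment, namely trying the widest dispatch mode first and keeping the first result that does not spill, instead of compiling every supported width and ranking them with a static analysis. It is a toy model: try_compile(), its fake spill behavior, and main() are invented for illustration and are not Mesa's actual brw_compile_fs() control flow.

#include <stdbool.h>
#include <stdio.h>

struct compile_result {
   unsigned simd_width;
   bool spilled;
};

/* Stand-in for one back-end compilation attempt at a given dispatch width.
 * Here we simply pretend that SIMD32 spills, to show the fallback path.
 */
static struct compile_result
try_compile(unsigned simd_width)
{
   struct compile_result r = {
      .simd_width = simd_width,
      .spilled = (simd_width == 32),   /* fake register pressure */
   };
   return r;
}

int
main(void)
{
   const unsigned widths[] = { 32, 16, 8 };   /* widest first */
   struct compile_result best = { .simd_width = 0, .spilled = true };

   for (unsigned i = 0; i < sizeof(widths) / sizeof(widths[0]); i++) {
      struct compile_result r = try_compile(widths[i]);

      if (!r.spilled) {
         /* Optimistic heuristic: assume the widest spill-free compilation
          * is also the fastest, so stop here rather than compiling the
          * remaining, narrower widths.  The default behavior (not modeled
          * here) compiles every supported width and chooses one via a
          * static performance estimate.
          */
         best = r;
         break;
      }
   }

   if (best.simd_width != 0)
      printf("selected SIMD%u dispatch\n", best.simd_width);
   return 0;
}

In this toy run SIMD32 "spills", so the loop falls back to SIMD16 after two compilation attempts; when nothing spills at 32-wide (the common case on xe3, per the comment), only a single attempt is made.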