diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp
index c312c27a8a6..b244387fdac 100644
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@@ -1641,7 +1641,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
       }
    }
 
-   if (devinfo->ver >= 30) {
+   if (compiler->optimistic_simd_heuristic) {
       unsigned max_dispatch_width = reqd_dispatch_width ? reqd_dispatch_width : 32;
 
       if (max_polygons >= 2 && !key->coarse_pixel) {
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 821c9087265..9e9086beb66 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -112,6 +112,9 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
    compiler->lower_dpas = !devinfo->has_systolic ||
       debug_get_bool_option("INTEL_LOWER_DPAS", false);
 
+   compiler->optimistic_simd_heuristic =
+      debug_get_bool_option("INTEL_SIMD_OPTIMISTIC", false);
+
    nir_lower_int64_options int64_options =
       nir_lower_imul64 |
       nir_lower_isign64 |
@@ -244,6 +247,8 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
    bits++;
    insert_u64_bit(&config, compiler->lower_dpas);
    bits++;
+   insert_u64_bit(&config, compiler->optimistic_simd_heuristic);
+   bits++;
 
    enum intel_debug_flag debug_bits[] = {
       DEBUG_NO_DUAL_OBJECT_GS,
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index d5eb632a731..361c172f596 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -121,6 +121,26 @@ struct brw_compiler {
     */
    bool lower_dpas;
 
+   /**
+    * This can be set to use an "optimistic" SIMD heuristic that
+    * assumes that the highest SIMD width and polygon count achievable
+    * without spills will give the highest performance, so the
+    * compiler doesn't need to try more than that.
+    *
+    * As of xe3 most programs compile without spills at 32-wide
+    * dispatch so with this option enabled typically only a single
+    * back-end compilation will be done instead of the default
+    * behavior of one compilation per supported dispatch mode. This
+    * can speed up the back-end compilation of fragment shaders by a
+    * 2+ factor, but could also increase compile-time especially on
+    * pre-xe3 platforms in cases with high register pressure.
+    *
+    * Run-time performance of the shaders will be reduced since this
+    * removes the ability to use a static analysis to estimate the
+    * relative performance of the dispatch modes supported.
+    */
+   bool optimistic_simd_heuristic;
+
    /**
     * Calling the ra_allocate function after each register spill can take
     * several minutes. This option speeds up shader compilation by spilling
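
Note: the standalone sketch below only illustrates the idea described in the new brw_compiler.h comment, namely trying the widest dispatch mode first and keeping the first result that does not spill, instead of compiling every supported width and ranking them with a static analysis. It is a toy model: try_compile(), its fake spill behavior, and main() are invented for illustration and are not Mesa's actual brw_compile_fs() control flow.

#include <stdbool.h>
#include <stdio.h>

struct compile_result {
   unsigned simd_width;
   bool spilled;
};

/* Stand-in for one back-end compilation attempt at a given dispatch width.
 * Here we simply pretend that SIMD32 spills, to show the fallback path.
 */
static struct compile_result
try_compile(unsigned simd_width)
{
   struct compile_result r = {
      .simd_width = simd_width,
      .spilled = (simd_width == 32),   /* fake register pressure */
   };
   return r;
}

int
main(void)
{
   const unsigned widths[] = { 32, 16, 8 };   /* widest first */
   struct compile_result best = { .simd_width = 0, .spilled = true };

   for (unsigned i = 0; i < sizeof(widths) / sizeof(widths[0]); i++) {
      struct compile_result r = try_compile(widths[i]);

      if (!r.spilled) {
         /* Optimistic heuristic: assume the widest spill-free compilation
          * is also the fastest, so stop here rather than compiling the
          * remaining, narrower widths.  The default behavior (not modeled
          * here) compiles every supported width and chooses one via a
          * static performance estimate.
          */
         best = r;
         break;
      }
   }

   if (best.simd_width != 0)
      printf("selected SIMD%u dispatch\n", best.simd_width);
   return 0;
}

In this toy run SIMD32 "spills", so the loop falls back to SIMD16 after two compilation attempts; when nothing spills at 32-wide (the common case on xe3, per the comment), only a single attempt is made.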