diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp index 9c17e7bd02d..250d2513b94 100644 --- a/src/intel/compiler/brw_compile_cs.cpp +++ b/src/intel/compiler/brw_compile_cs.cpp @@ -188,6 +188,14 @@ brw_compile_cs(const struct brw_compiler *compiler, .required_width = brw_required_dispatch_width(&nir->info), }; + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + for (unsigned i = 0; i < SIMD_COUNT; i++) { + simd_state.beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir); std::unique_ptr v[3]; diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index 3c8b500af1b..60b9c2cd009 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -1558,8 +1558,19 @@ brw_compile_fs(const struct brw_compiler *compiler, NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_msaa_intel, f); } - brw_postprocess_nir(nir, compiler, 0, params->base.archiver, - debug_enabled, key->base.robust_flags); + brw_postprocess_nir_opts(nir, compiler, key->base.robust_flags); + + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + bool beyond_threshold[SIMD_COUNT] = {}; + for (unsigned i = (devinfo->ver >= 20 ? 
2 : 1); i < SIMD_COUNT; i++) { + beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + + brw_postprocess_nir_out_of_ssa(nir, 0, params->base.archiver, + debug_enabled); int per_primitive_offsets[VARYING_SLOT_MAX]; memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); @@ -1749,7 +1760,8 @@ brw_compile_fs(const struct brw_compiler *compiler, } } else { - if ((!has_spilled && dispatch_width_limit >= 16 && INTEL_SIMD(FS, 16)) || + if ((!has_spilled && dispatch_width_limit >= 16 && + !beyond_threshold[1] && INTEL_SIMD(FS, 16)) || reqd_dispatch_width == 16) { /* Try a SIMD16 compile */ brw_shader_params shader_params = base_shader_params; @@ -1783,6 +1795,7 @@ brw_compile_fs(const struct brw_compiler *compiler, /* Currently, the compiler only supports SIMD32 on SNB+ */ if (!has_spilled && dispatch_width_limit >= 32 && + !beyond_threshold[2] && reqd_dispatch_width == 0 && !simd16_failed && INTEL_SIMD(FS, 32) && !prog_data->base.ray_queries) { @@ -1821,7 +1834,7 @@ brw_compile_fs(const struct brw_compiler *compiler, reqd_dispatch_width == 0) { if (devinfo->ver >= 20 && max_polygons >= 4 && - dispatch_width_limit >= 32 && + dispatch_width_limit >= 32 && !beyond_threshold[2] && 4 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 4X8)) { /* Try a quad-SIMD8 compile */ @@ -1841,7 +1854,7 @@ brw_compile_fs(const struct brw_compiler *compiler, } if (!vmulti && devinfo->ver >= 20 && - dispatch_width_limit >= 32 && + dispatch_width_limit >= 32 && !beyond_threshold[2] && 2 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 2X16)) { /* Try a dual-SIMD16 compile */ @@ -1860,7 +1873,7 @@ brw_compile_fs(const struct brw_compiler *compiler, } } - if (!vmulti && dispatch_width_limit >= 16 && + if (!vmulti && dispatch_width_limit >= 16 && !beyond_threshold[1] && 2 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 2X8)) { /* Try a dual-SIMD8 compile */ diff --git a/src/intel/compiler/brw_compile_mesh.cpp 
b/src/intel/compiler/brw_compile_mesh.cpp index e245a78c42f..c72990ba791 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -394,6 +394,14 @@ brw_compile_task(const struct brw_compiler *compiler, brw_debug_archive_nir(params->base.archiver, nir, 0, "before-simd"); + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + for (unsigned i = 0; i < SIMD_COUNT; i++) { + simd_state.beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + std::unique_ptr v[3]; for (unsigned i = 0; i < 3; i++) { diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 116bb0f8b5d..1bb38b2df36 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -115,6 +115,22 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) compiler->optimistic_simd_heuristic = debug_get_bool_option("INTEL_SIMD_OPTIMISTIC", false); + /* We have 128 GRFs on Xe2 and earlier, and up to 256 on Xe3. But we add + * a bit of a fudge factor here to allow shaders that are borderline + * allocatable to at least try, so we don't lose out on performance. + * + * On Xe2, 134 seems to be the sweet spot where we get most of the CPU + * gains for discarding expensive compilation, but only a few outliers + * have a higher pressure and yet manage to succeed at SIMD32 compilation. + * + * On Xe3, VRT gives us up to double the registers, and + * very few shaders fail to compile. We set the threshold to ~2x the Xe2 + * value, which still catches something like 80% of the failing shaders + * while letting almost all through to the backend for more detailed + * throughput analysis. + */ + compiler->register_pressure_threshold = devinfo->ver >= 30 ?
268 : 134; + nir_lower_int64_options int64_options = nir_lower_imul64 | nir_lower_isign64 | diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index c382ab4bd26..9daaa30610a 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -151,6 +151,18 @@ struct brw_compiler { */ int spilling_rate; + /** + * We perform a quick register pressure estimate at the NIR level before + * attempting backend compilation at various SIMD widths. If the estimated + * register pressure for a given SIMD width is beyond this threshold, we + * will skip FS and CS compilation at that width. + * + * This is vaguely the number of GRFs supported by the hardware, with a + * bit of a fudge factor because we are only estimating, and sometimes + * backend IR scheduling can reduce register pressure. + */ + unsigned register_pressure_threshold; + /** * A list of storage formats to lower from the matching return HW format. * diff --git a/src/intel/compiler/brw_private.h b/src/intel/compiler/brw_private.h index 9a2f04e0bff..2309d972ac3 100644 --- a/src/intel/compiler/brw_private.h +++ b/src/intel/compiler/brw_private.h @@ -70,6 +70,7 @@ struct brw_simd_selection_state { bool compiled[SIMD_COUNT]; bool spilled[SIMD_COUNT]; + bool beyond_threshold[SIMD_COUNT]; }; inline int brw_simd_first_compiled(const brw_simd_selection_state &state) diff --git a/src/intel/compiler/brw_simd_selection.cpp b/src/intel/compiler/brw_simd_selection.cpp index e9ae4d3ae95..cfd5442e47b 100644 --- a/src/intel/compiler/brw_simd_selection.cpp +++ b/src/intel/compiler/brw_simd_selection.cpp @@ -101,6 +101,13 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) return false; } + const unsigned min_simd = state.devinfo->ver >= 20 ? 
1 : 0; + + if (simd > min_simd && state.beyond_threshold[simd]) { + state.error[simd] = "estimated to be beyond the pressure threshold"; + return false; + } + if (cs_prog_data) { const unsigned workgroup_size = cs_prog_data->local_size[0] * cs_prog_data->local_size[1] * @@ -108,7 +115,6 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) unsigned max_threads = state.devinfo->max_cs_workgroup_threads; - const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0; if (simd > min_simd && workgroup_size <= (width / 2)) { state.error[simd] = "Workgroup size already fits in smaller SIMD"; return false;