mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-22 13:30:12 +01:00
brw: Skip compilation of larger SIMDs when pressure is too high
This allows us to skip the entire backend compilation process for large SIMD widths when register pressure is high enough that we'd likely decide to prefer a smaller one in the end anyway. The hope is to make the same decisions as before, but with less CPU overhead.

We are making mostly the same decisions as before:

| API / Platform | Total Shaders | Changed | % Identical |
|----------------|---------------|---------|-------------|
| VK / Arc A770  | 905,525       | 1,157   | 99.872%     |
| VK / Arc B580  | 788,127       | 53      | 99.993%     |
| VK / Panther   | 786,333       | 13      | 99.998%     |
| GL / Arc A770  | 308,618       | 269     | 99.913%     |
| GL / Arc B580  | 264,066       | 13      | 99.995%     |
| GL / Panther   | 273,212       | 0       | 100.000%    |

Improves compile times on my i7-12700K:

| Game                      | Arc B580 | Arc A770 |
|---------------------------|----------|----------|
| Assassins Creed: Odyssey  | -13.47%  | -10.98%  |
| Borderlands 3 (DX12)      | -10.05%  | -11.31%  |
| Dark Souls 3              | -21.06%  | -21.08%  |
| Oblivion Remastered       | -11.10%  | -9.82%   |
| Phasmophobia              | -32.73%  | -31.00%  |
| Red Dead Redemption 2     | -20.10%  | -14.38%  |
| Total War: Warhammer III  | -10.11%  | -14.44%  |
| Wolfenstein Youngblood    | -15.91%  | -13.47%  |
| Shadow of the Tomb Raider | -30.23%  | -25.86%  |

It seems to have nearly no effect on compile times on Xe3 unfortunately, as only 1,014 shaders in fossil-db even fail SIMD32 compilation in the first place, and we want to let most of the "might succeed" cases through to the backend for throughput analysis.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36750>
This commit is contained in:
parent
248050b6d0
commit
3af4e63061
7 changed files with 71 additions and 7 deletions
|
|
@ -188,6 +188,14 @@ brw_compile_cs(const struct brw_compiler *compiler,
|
||||||
.required_width = brw_required_dispatch_width(&nir->info),
|
.required_width = brw_required_dispatch_width(&nir->info),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
unsigned pressure[SIMD_COUNT];
|
||||||
|
brw_nir_quick_pressure_estimate(nir, devinfo, pressure);
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < SIMD_COUNT; i++) {
|
||||||
|
simd_state.beyond_threshold[i] =
|
||||||
|
pressure[i] > compiler->register_pressure_threshold;
|
||||||
|
}
|
||||||
|
|
||||||
prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir);
|
prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir);
|
||||||
|
|
||||||
std::unique_ptr<brw_shader> v[3];
|
std::unique_ptr<brw_shader> v[3];
|
||||||
|
|
|
||||||
|
|
@ -1558,8 +1558,19 @@ brw_compile_fs(const struct brw_compiler *compiler,
|
||||||
NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_msaa_intel, f);
|
NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_msaa_intel, f);
|
||||||
}
|
}
|
||||||
|
|
||||||
brw_postprocess_nir(nir, compiler, 0, params->base.archiver,
|
brw_postprocess_nir_opts(nir, compiler, key->base.robust_flags);
|
||||||
debug_enabled, key->base.robust_flags);
|
|
||||||
|
unsigned pressure[SIMD_COUNT];
|
||||||
|
brw_nir_quick_pressure_estimate(nir, devinfo, pressure);
|
||||||
|
|
||||||
|
bool beyond_threshold[SIMD_COUNT] = {};
|
||||||
|
for (unsigned i = (devinfo->ver >= 20 ? 2 : 1); i < SIMD_COUNT; i++) {
|
||||||
|
beyond_threshold[i] =
|
||||||
|
pressure[i] > compiler->register_pressure_threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
brw_postprocess_nir_out_of_ssa(nir, 0, params->base.archiver,
|
||||||
|
debug_enabled);
|
||||||
|
|
||||||
int per_primitive_offsets[VARYING_SLOT_MAX];
|
int per_primitive_offsets[VARYING_SLOT_MAX];
|
||||||
memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));
|
memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));
|
||||||
|
|
@ -1749,7 +1760,8 @@ brw_compile_fs(const struct brw_compiler *compiler,
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if ((!has_spilled && dispatch_width_limit >= 16 && INTEL_SIMD(FS, 16)) ||
|
if ((!has_spilled && dispatch_width_limit >= 16 &&
|
||||||
|
!beyond_threshold[1] && INTEL_SIMD(FS, 16)) ||
|
||||||
reqd_dispatch_width == 16) {
|
reqd_dispatch_width == 16) {
|
||||||
/* Try a SIMD16 compile */
|
/* Try a SIMD16 compile */
|
||||||
brw_shader_params shader_params = base_shader_params;
|
brw_shader_params shader_params = base_shader_params;
|
||||||
|
|
@ -1783,6 +1795,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
|
||||||
/* Currently, the compiler only supports SIMD32 on SNB+ */
|
/* Currently, the compiler only supports SIMD32 on SNB+ */
|
||||||
if (!has_spilled &&
|
if (!has_spilled &&
|
||||||
dispatch_width_limit >= 32 &&
|
dispatch_width_limit >= 32 &&
|
||||||
|
!beyond_threshold[2] &&
|
||||||
reqd_dispatch_width == 0 &&
|
reqd_dispatch_width == 0 &&
|
||||||
!simd16_failed && INTEL_SIMD(FS, 32) &&
|
!simd16_failed && INTEL_SIMD(FS, 32) &&
|
||||||
!prog_data->base.ray_queries) {
|
!prog_data->base.ray_queries) {
|
||||||
|
|
@ -1821,7 +1834,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
|
||||||
reqd_dispatch_width == 0) {
|
reqd_dispatch_width == 0) {
|
||||||
|
|
||||||
if (devinfo->ver >= 20 && max_polygons >= 4 &&
|
if (devinfo->ver >= 20 && max_polygons >= 4 &&
|
||||||
dispatch_width_limit >= 32 &&
|
dispatch_width_limit >= 32 && !beyond_threshold[2] &&
|
||||||
4 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
4 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
||||||
INTEL_SIMD(FS, 4X8)) {
|
INTEL_SIMD(FS, 4X8)) {
|
||||||
/* Try a quad-SIMD8 compile */
|
/* Try a quad-SIMD8 compile */
|
||||||
|
|
@ -1841,7 +1854,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!vmulti && devinfo->ver >= 20 &&
|
if (!vmulti && devinfo->ver >= 20 &&
|
||||||
dispatch_width_limit >= 32 &&
|
dispatch_width_limit >= 32 && !beyond_threshold[2] &&
|
||||||
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
||||||
INTEL_SIMD(FS, 2X16)) {
|
INTEL_SIMD(FS, 2X16)) {
|
||||||
/* Try a dual-SIMD16 compile */
|
/* Try a dual-SIMD16 compile */
|
||||||
|
|
@ -1860,7 +1873,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!vmulti && dispatch_width_limit >= 16 &&
|
if (!vmulti && dispatch_width_limit >= 16 && !beyond_threshold[1] &&
|
||||||
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
|
||||||
INTEL_SIMD(FS, 2X8)) {
|
INTEL_SIMD(FS, 2X8)) {
|
||||||
/* Try a dual-SIMD8 compile */
|
/* Try a dual-SIMD8 compile */
|
||||||
|
|
|
||||||
|
|
@ -394,6 +394,14 @@ brw_compile_task(const struct brw_compiler *compiler,
|
||||||
|
|
||||||
brw_debug_archive_nir(params->base.archiver, nir, 0, "before-simd");
|
brw_debug_archive_nir(params->base.archiver, nir, 0, "before-simd");
|
||||||
|
|
||||||
|
unsigned pressure[SIMD_COUNT];
|
||||||
|
brw_nir_quick_pressure_estimate(nir, devinfo, pressure);
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < SIMD_COUNT; i++) {
|
||||||
|
simd_state.beyond_threshold[i] =
|
||||||
|
pressure[i] > compiler->register_pressure_threshold;
|
||||||
|
}
|
||||||
|
|
||||||
std::unique_ptr<brw_shader> v[3];
|
std::unique_ptr<brw_shader> v[3];
|
||||||
|
|
||||||
for (unsigned i = 0; i < 3; i++) {
|
for (unsigned i = 0; i < 3; i++) {
|
||||||
|
|
|
||||||
|
|
@ -115,6 +115,22 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
||||||
compiler->optimistic_simd_heuristic =
|
compiler->optimistic_simd_heuristic =
|
||||||
debug_get_bool_option("INTEL_SIMD_OPTIMISTIC", false);
|
debug_get_bool_option("INTEL_SIMD_OPTIMISTIC", false);
|
||||||
|
|
||||||
|
/* We have 128 GRFs on Xe2 and earlier, and up to 256 on Xe3. But we add
|
||||||
|
* a bit of a fudge factor here to allow shaders that are borderline
|
||||||
|
* allocatable to at least try, so we don't lose out on performance.
|
||||||
|
*
|
||||||
|
* On Xe2, 134 seems to be the sweet spot where we get most of the CPU
|
||||||
|
* gains for discarding expensive compilation, but only a few outliers
|
||||||
|
* have a higher pressure and yet manage to succeed at SIMD32 compilation.
|
||||||
|
*
|
||||||
|
* On Xe3 with VRT we can have double the registers thanks to VRT, and
|
||||||
|
* very few shaders fail to compile. We set the threshold to ~2x the Xe2
|
||||||
|
* value, which still catches something like 80% of the failing shaders
|
||||||
|
* while letting almost all through to the backend for more detailed
|
||||||
|
* throughput analysis.
|
||||||
|
*/
|
||||||
|
compiler->register_pressure_threshold = devinfo->ver >= 30 ? 268 : 134;
|
||||||
|
|
||||||
nir_lower_int64_options int64_options =
|
nir_lower_int64_options int64_options =
|
||||||
nir_lower_imul64 |
|
nir_lower_imul64 |
|
||||||
nir_lower_isign64 |
|
nir_lower_isign64 |
|
||||||
|
|
|
||||||
|
|
@ -151,6 +151,18 @@ struct brw_compiler {
|
||||||
*/
|
*/
|
||||||
int spilling_rate;
|
int spilling_rate;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We perform a quick register pressure estimate at the NIR level before
|
||||||
|
* attempting backend compilation at various SIMD widths. If the estimated
|
||||||
|
* register pressure for a given SIMD width is beyond this threshold, we
|
||||||
|
* will skip FS and CS compilation at that width.
|
||||||
|
*
|
||||||
|
* This is vaguely the number of GRFs supported by the hardware, with a
|
||||||
|
* bit of a fudge factor because we are only estimating, and sometimes
|
||||||
|
* backend IR scheduling can reduce register pressure.
|
||||||
|
*/
|
||||||
|
unsigned register_pressure_threshold;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A list of storage formats to lower from the matching return HW format.
|
* A list of storage formats to lower from the matching return HW format.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -70,6 +70,7 @@ struct brw_simd_selection_state {
|
||||||
|
|
||||||
bool compiled[SIMD_COUNT];
|
bool compiled[SIMD_COUNT];
|
||||||
bool spilled[SIMD_COUNT];
|
bool spilled[SIMD_COUNT];
|
||||||
|
bool beyond_threshold[SIMD_COUNT];
|
||||||
};
|
};
|
||||||
|
|
||||||
inline int brw_simd_first_compiled(const brw_simd_selection_state &state)
|
inline int brw_simd_first_compiled(const brw_simd_selection_state &state)
|
||||||
|
|
|
||||||
|
|
@ -101,6 +101,13 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
|
||||||
|
|
||||||
|
if (simd > min_simd && state.beyond_threshold[simd]) {
|
||||||
|
state.error[simd] = "estimated to be beyond the pressure threshold";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (cs_prog_data) {
|
if (cs_prog_data) {
|
||||||
const unsigned workgroup_size = cs_prog_data->local_size[0] *
|
const unsigned workgroup_size = cs_prog_data->local_size[0] *
|
||||||
cs_prog_data->local_size[1] *
|
cs_prog_data->local_size[1] *
|
||||||
|
|
@ -108,7 +115,6 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
|
||||||
|
|
||||||
unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
|
unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
|
||||||
|
|
||||||
const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
|
|
||||||
if (simd > min_simd && workgroup_size <= (width / 2)) {
|
if (simd > min_simd && workgroup_size <= (width / 2)) {
|
||||||
state.error[simd] = "Workgroup size already fits in smaller SIMD";
|
state.error[simd] = "Workgroup size already fits in smaller SIMD";
|
||||||
return false;
|
return false;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue