From 3af4e63061cdc8c5de61bf03ebb967909a1278c9 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Mon, 11 Aug 2025 02:39:49 -0700 Subject: [PATCH] brw: Skip compilation of larger SIMDs when pressure is too high This allows us to skip the entire backend compilation process for large SIMD widths when register pressure is high enough that we'd likely decide to prefer a smaller one in the end anyway. The hope is to make the same decisions as before, but with less CPU overhead. We are making mostly the same decisions as before: | API / Platform | Total Shaders | Changed | % Identical -------------------------------------------------- | VK / Arc A770 | 905,525 | 1,157 | 99.872% | | VK / Arc B580 | 788,127 | 53 | 99.993% | | VK / Panther | 786,333 | 13 | 99.998% | | GL / Arc A770 | 308,618 | 269 | 99.913% | | GL / Arc B580 | 264,066 | 13 | 99.995% | | GL / Panther | 273,212 | 0 | 100.000% | Improves compile times on my i7-12700K: | Game | Arc B580 | Arc A770 | --------------------------------------------------- | Assassins Creed: Odyssey | -13.47% | -10.98% | | Borderlands 3 (DX12) | -10.05% | -11.31% | | Dark Souls 3 | -21.06% | -21.08% | | Oblivion Remastered | -11.10% | -9.82% | | Phasmophobia | -32.73% | -31.00% | | Red Dead Redemption 2 | -20.10% | -14.38% | | Total War: Warhammer III | -10.11% | -14.44% | | Wolfenstein Youngblood | -15.91% | -13.47% | | Shadow of the Tomb Raider | -30.23% | -25.86% | It seems to have nearly no effect on compile times on Xe3 unfortunately, as only 1,014 shaders in fossil-db even fail SIMD32 compilation in the first place, and we want to let most of the "might succeed" cases through to the backend for throughput analysis. 
Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw_compile_cs.cpp | 8 ++++++++ src/intel/compiler/brw_compile_fs.cpp | 25 +++++++++++++++++------ src/intel/compiler/brw_compile_mesh.cpp | 8 ++++++++ src/intel/compiler/brw_compiler.c | 16 +++++++++++++++ src/intel/compiler/brw_compiler.h | 12 +++++++++++ src/intel/compiler/brw_private.h | 1 + src/intel/compiler/brw_simd_selection.cpp | 8 +++++++- 7 files changed, 71 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp index 9c17e7bd02d..250d2513b94 100644 --- a/src/intel/compiler/brw_compile_cs.cpp +++ b/src/intel/compiler/brw_compile_cs.cpp @@ -188,6 +188,14 @@ brw_compile_cs(const struct brw_compiler *compiler, .required_width = brw_required_dispatch_width(&nir->info), }; + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + for (unsigned i = 0; i < SIMD_COUNT; i++) { + simd_state.beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir); std::unique_ptr v[3]; diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index 3c8b500af1b..60b9c2cd009 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -1558,8 +1558,19 @@ brw_compile_fs(const struct brw_compiler *compiler, NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_msaa_intel, f); } - brw_postprocess_nir(nir, compiler, 0, params->base.archiver, - debug_enabled, key->base.robust_flags); + brw_postprocess_nir_opts(nir, compiler, key->base.robust_flags); + + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + bool beyond_threshold[SIMD_COUNT] = {}; + for (unsigned i = (devinfo->ver >= 20 ? 
2 : 1); i < SIMD_COUNT; i++) { + beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + + brw_postprocess_nir_out_of_ssa(nir, 0, params->base.archiver, + debug_enabled); int per_primitive_offsets[VARYING_SLOT_MAX]; memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); @@ -1749,7 +1760,8 @@ brw_compile_fs(const struct brw_compiler *compiler, } } else { - if ((!has_spilled && dispatch_width_limit >= 16 && INTEL_SIMD(FS, 16)) || + if ((!has_spilled && dispatch_width_limit >= 16 && + !beyond_threshold[1] && INTEL_SIMD(FS, 16)) || reqd_dispatch_width == 16) { /* Try a SIMD16 compile */ brw_shader_params shader_params = base_shader_params; @@ -1783,6 +1795,7 @@ brw_compile_fs(const struct brw_compiler *compiler, /* Currently, the compiler only supports SIMD32 on SNB+ */ if (!has_spilled && dispatch_width_limit >= 32 && + !beyond_threshold[2] && reqd_dispatch_width == 0 && !simd16_failed && INTEL_SIMD(FS, 32) && !prog_data->base.ray_queries) { @@ -1821,7 +1834,7 @@ brw_compile_fs(const struct brw_compiler *compiler, reqd_dispatch_width == 0) { if (devinfo->ver >= 20 && max_polygons >= 4 && - dispatch_width_limit >= 32 && + dispatch_width_limit >= 32 && !beyond_threshold[2] && 4 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 4X8)) { /* Try a quad-SIMD8 compile */ @@ -1841,7 +1854,7 @@ brw_compile_fs(const struct brw_compiler *compiler, } if (!vmulti && devinfo->ver >= 20 && - dispatch_width_limit >= 32 && + dispatch_width_limit >= 32 && !beyond_threshold[2] && 2 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 2X16)) { /* Try a dual-SIMD16 compile */ @@ -1860,7 +1873,7 @@ brw_compile_fs(const struct brw_compiler *compiler, } } - if (!vmulti && dispatch_width_limit >= 16 && + if (!vmulti && dispatch_width_limit >= 16 && !beyond_threshold[1] && 2 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 2X8)) { /* Try a dual-SIMD8 compile */ diff --git a/src/intel/compiler/brw_compile_mesh.cpp 
b/src/intel/compiler/brw_compile_mesh.cpp index e245a78c42f..c72990ba791 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -394,6 +394,14 @@ brw_compile_task(const struct brw_compiler *compiler, brw_debug_archive_nir(params->base.archiver, nir, 0, "before-simd"); + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + for (unsigned i = 0; i < SIMD_COUNT; i++) { + simd_state.beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + std::unique_ptr v[3]; for (unsigned i = 0; i < 3; i++) { diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 116bb0f8b5d..1bb38b2df36 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -115,6 +115,22 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) compiler->optimistic_simd_heuristic = debug_get_bool_option("INTEL_SIMD_OPTIMISTIC", false); + /* We have 128 GRFs on Xe2 and earlier, and up to 256 on Xe3. But we add + * a bit of a fudge factor here to allow shaders that are borderline + * allocatable to at least try, so we don't lose out on performance. + * + * On Xe2, 134 seems to be the sweet spot where we get most of the CPU + * gains for discarding expensive compilation, but only a few outliers + * have a higher pressure and yet manage to succeed at SIMD32 compilation. + * + * On Xe3, thanks to VRT, we can have double the registers, and + * very few shaders fail to compile. We set the threshold to ~2x the Xe2 + * value, which still catches something like 80% of the failing shaders + * while letting almost all through to the backend for more detailed + * throughput analysis. + */ + compiler->register_pressure_threshold = devinfo->ver >= 30 ? 
268 : 134; + nir_lower_int64_options int64_options = nir_lower_imul64 | nir_lower_isign64 | diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index c382ab4bd26..9daaa30610a 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -151,6 +151,18 @@ struct brw_compiler { */ int spilling_rate; + /** + * We perform a quick register pressure estimate at the NIR level before + * attempting backend compilation at various SIMD widths. If the estimated + * register pressure for a given SIMD width is beyond this threshold, we + * will skip FS and CS compilation at that width. + * + * This is vaguely the number of GRFs supported by the hardware, with a + * bit of a fudge factor because we are only estimating, and sometimes + * backend IR scheduling can reduce register pressure. + */ + unsigned register_pressure_threshold; + /** * A list of storage formats to lower from the matching return HW format. * diff --git a/src/intel/compiler/brw_private.h b/src/intel/compiler/brw_private.h index 9a2f04e0bff..2309d972ac3 100644 --- a/src/intel/compiler/brw_private.h +++ b/src/intel/compiler/brw_private.h @@ -70,6 +70,7 @@ struct brw_simd_selection_state { bool compiled[SIMD_COUNT]; bool spilled[SIMD_COUNT]; + bool beyond_threshold[SIMD_COUNT]; }; inline int brw_simd_first_compiled(const brw_simd_selection_state &state) diff --git a/src/intel/compiler/brw_simd_selection.cpp b/src/intel/compiler/brw_simd_selection.cpp index e9ae4d3ae95..cfd5442e47b 100644 --- a/src/intel/compiler/brw_simd_selection.cpp +++ b/src/intel/compiler/brw_simd_selection.cpp @@ -101,6 +101,13 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) return false; } + const unsigned min_simd = state.devinfo->ver >= 20 ? 
1 : 0; + + if (simd > min_simd && state.beyond_threshold[simd]) { + state.error[simd] = "estimated to be beyond the pressure threshold"; + return false; + } + if (cs_prog_data) { const unsigned workgroup_size = cs_prog_data->local_size[0] * cs_prog_data->local_size[1] * @@ -108,7 +115,6 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) unsigned max_threads = state.devinfo->max_cs_workgroup_threads; - const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0; if (simd > min_simd && workgroup_size <= (width / 2)) { state.error[simd] = "Workgroup size already fits in smaller SIMD"; return false;