diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp index 9c17e7bd02d..250d2513b94 100644 --- a/src/intel/compiler/brw_compile_cs.cpp +++ b/src/intel/compiler/brw_compile_cs.cpp @@ -188,6 +188,14 @@ brw_compile_cs(const struct brw_compiler *compiler, .required_width = brw_required_dispatch_width(&nir->info), }; + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + for (unsigned i = 0; i < SIMD_COUNT; i++) { + simd_state.beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + prog_data->uses_sampler = brw_nir_uses_sampler(params->base.nir); std::unique_ptr v[3]; diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index 3c8b500af1b..60b9c2cd009 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -1558,8 +1558,19 @@ brw_compile_fs(const struct brw_compiler *compiler, NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_msaa_intel, f); } - brw_postprocess_nir(nir, compiler, 0, params->base.archiver, - debug_enabled, key->base.robust_flags); + brw_postprocess_nir_opts(nir, compiler, key->base.robust_flags); + + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + bool beyond_threshold[SIMD_COUNT] = {}; + for (unsigned i = (devinfo->ver >= 20 ? 
2 : 1); i < SIMD_COUNT; i++) { + beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + + brw_postprocess_nir_out_of_ssa(nir, 0, params->base.archiver, + debug_enabled); int per_primitive_offsets[VARYING_SLOT_MAX]; memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); @@ -1749,7 +1760,8 @@ brw_compile_fs(const struct brw_compiler *compiler, } } else { - if ((!has_spilled && dispatch_width_limit >= 16 && INTEL_SIMD(FS, 16)) || + if ((!has_spilled && dispatch_width_limit >= 16 && + !beyond_threshold[1] && INTEL_SIMD(FS, 16)) || reqd_dispatch_width == 16) { /* Try a SIMD16 compile */ brw_shader_params shader_params = base_shader_params; @@ -1783,6 +1795,7 @@ brw_compile_fs(const struct brw_compiler *compiler, /* Currently, the compiler only supports SIMD32 on SNB+ */ if (!has_spilled && dispatch_width_limit >= 32 && + !beyond_threshold[2] && reqd_dispatch_width == 0 && !simd16_failed && INTEL_SIMD(FS, 32) && !prog_data->base.ray_queries) { @@ -1821,7 +1834,7 @@ brw_compile_fs(const struct brw_compiler *compiler, reqd_dispatch_width == 0) { if (devinfo->ver >= 20 && max_polygons >= 4 && - dispatch_width_limit >= 32 && + dispatch_width_limit >= 32 && !beyond_threshold[2] && 4 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 4X8)) { /* Try a quad-SIMD8 compile */ @@ -1841,7 +1854,7 @@ brw_compile_fs(const struct brw_compiler *compiler, } if (!vmulti && devinfo->ver >= 20 && - dispatch_width_limit >= 32 && + dispatch_width_limit >= 32 && !beyond_threshold[2] && 2 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 2X16)) { /* Try a dual-SIMD16 compile */ @@ -1860,7 +1873,7 @@ brw_compile_fs(const struct brw_compiler *compiler, } } - if (!vmulti && dispatch_width_limit >= 16 && + if (!vmulti && dispatch_width_limit >= 16 && !beyond_threshold[1] && 2 * prog_data->num_varying_inputs <= MAX_VARYING && INTEL_SIMD(FS, 2X8)) { /* Try a dual-SIMD8 compile */ diff --git a/src/intel/compiler/brw_compile_mesh.cpp 
b/src/intel/compiler/brw_compile_mesh.cpp index e245a78c42f..c72990ba791 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -394,6 +394,14 @@ brw_compile_task(const struct brw_compiler *compiler, brw_debug_archive_nir(params->base.archiver, nir, 0, "before-simd"); + unsigned pressure[SIMD_COUNT]; + brw_nir_quick_pressure_estimate(nir, devinfo, pressure); + + for (unsigned i = 0; i < SIMD_COUNT; i++) { + simd_state.beyond_threshold[i] = + pressure[i] > compiler->register_pressure_threshold; + } + std::unique_ptr v[3]; for (unsigned i = 0; i < 3; i++) { diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 116bb0f8b5d..1bb38b2df36 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -115,6 +115,22 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) compiler->optimistic_simd_heuristic = debug_get_bool_option("INTEL_SIMD_OPTIMISTIC", false); + /* We have 128 GRFs on Xe2 and earlier, and up to 256 on Xe3. But we add + * a bit of a fudge factor here to allow shaders that are borderline + * allocatable to at least try, so we don't lose out on performance. + * + * On Xe2, 134 seems to be the sweet spot where we get most of the CPU + * gains for discarding expensive compilation, but only a few outliers + * have a higher pressure and yet manage to succeed at SIMD32 compilation. + * + * On Xe3, VRT gives us up to double the registers, and + * very few shaders fail to compile. We set the threshold to ~2x the Xe2 + * value, which still catches something like 80% of the failing shaders + * while letting almost all through to the backend for more detailed + * throughput analysis. + */ + compiler->register_pressure_threshold = devinfo->ver >= 30 ?
268 : 134; + nir_lower_int64_options int64_options = nir_lower_imul64 | nir_lower_isign64 | diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index c382ab4bd26..9daaa30610a 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -151,6 +151,18 @@ struct brw_compiler { */ int spilling_rate; + /** + * We perform a quick register pressure estimate at the NIR level before + * attempting backend compilation at various SIMD widths. If the estimated + * register pressure for a given SIMD width is beyond this threshold, we + * will skip FS and CS compilation at that width. + * + * This is vaguely the number of GRFs supported by the hardware, with a + * bit of a fudge factor because we are only estimating, and sometimes + * backend IR scheduling can reduce register pressure. + */ + unsigned register_pressure_threshold; + /** * A list of storage formats to lower from the matching return HW format. * diff --git a/src/intel/compiler/brw_private.h b/src/intel/compiler/brw_private.h index 9a2f04e0bff..2309d972ac3 100644 --- a/src/intel/compiler/brw_private.h +++ b/src/intel/compiler/brw_private.h @@ -70,6 +70,7 @@ struct brw_simd_selection_state { bool compiled[SIMD_COUNT]; bool spilled[SIMD_COUNT]; + bool beyond_threshold[SIMD_COUNT]; }; inline int brw_simd_first_compiled(const brw_simd_selection_state &state) diff --git a/src/intel/compiler/brw_simd_selection.cpp b/src/intel/compiler/brw_simd_selection.cpp index e9ae4d3ae95..cfd5442e47b 100644 --- a/src/intel/compiler/brw_simd_selection.cpp +++ b/src/intel/compiler/brw_simd_selection.cpp @@ -101,6 +101,13 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) return false; } + const unsigned min_simd = state.devinfo->ver >= 20 ? 
1 : 0; + + if (simd > min_simd && state.beyond_threshold[simd]) { + state.error[simd] = "estimated to be beyond the pressure threshold"; + return false; + } + if (cs_prog_data) { const unsigned workgroup_size = cs_prog_data->local_size[0] * cs_prog_data->local_size[1] * @@ -108,7 +115,6 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) unsigned max_threads = state.devinfo->max_cs_workgroup_threads; - const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0; if (simd > min_simd && workgroup_size <= (width / 2)) { state.error[simd] = "Workgroup size already fits in smaller SIMD"; return false;