intel/brw/xe3+: brw_compile_fs() implementation for Xe3+.

This reworks the implementation of brw_compile_fs() to reduce compile
time and take advantage of wider dispatch modes more aggressively than
the original logic.

The new "optimistic" PS compilation logic starts with the SIMD width
that is potentially highest performance and only compiles additional
narrower variants if that fails (typically due to spilling or hardware
restrictions), while the old "pessimistic" logic did the opposite: It
started with the narrowest SIMD width and compiled additional variants
with increasing register pressure until one of them failed to compile.

The main disadvantage of this is that selectively throwing away some
of the compiled variants based on the static analysis of their
performance behavior will no longer be possible; however, this is
expected to be less useful on Xe3+ since the GRF space allocated to a
thread can be scaled up or down, which leads to less dramatic
differences in scheduling between SIMD variants.

In typical non-spilling cases where we formerly compiled SIMD16 and
SIMD32 variants of the same fragment shader, this change will halve
the number of backend compilations required to build a shader.  With
multi-polygon PS dispatch enabled (which is disabled by default right
now) this has an even more dramatic effect since the number of
compiler iterations can be reduced down to a fifth in the best case
scenario.

Even though in most cases we will only attempt to return a single
binary from the pixel shader compilation, the hardware allows a pair
of PS kernels to be specified, and we'll still take advantage of this
when the multi-polygon PS kernel has the potential to have worse
performance than the single-polygon shader because only the latter
register-allocates successfully at SIMD32 -- Only in such a case
(SIMD2x8 multi-polygon, SIMD32 single-polygon) will we continue
programming both, so the hardware will choose one or the other at
runtime depending on the SIMD fullness and the number of polygons it
can buffer.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32664>
This commit is contained in:
Francisco Jerez 2024-09-18 14:38:19 -07:00 committed by Marge Bot
parent 1b2bd1fcb8
commit 5b6906076e

View file

@ -1624,6 +1624,121 @@ brw_compile_fs(const struct brw_compiler *compiler,
}
if (devinfo->ver >= 30) {
unsigned max_dispatch_width = reqd_dispatch_width ? reqd_dispatch_width : 32;
fs_visitor *vbase = NULL;
if (params->max_polygons >= 2 && !key->coarse_pixel) {
if (params->max_polygons >= 4 && max_dispatch_width >= 32 &&
4 * prog_data->num_varying_inputs <= MAX_VARYING &&
INTEL_SIMD(FS, 4X8)) {
/* Try a quad-SIMD8 compile */
vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
prog_data, nir, 32, 4,
params->base.stats != NULL,
debug_enabled);
max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);
if (!run_fs(*vmulti, false, false)) {
brw_shader_perf_log(compiler, params->base.log_data,
"Quad-SIMD8 shader failed to compile: %s\n",
vmulti->fail_msg);
} else {
vbase = vmulti.get();
multi_cfg = vmulti->cfg;
assert(!vmulti->spilled_any_registers);
}
}
if (!vbase && max_dispatch_width >= 32 &&
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
INTEL_SIMD(FS, 2X16)) {
/* Try a dual-SIMD16 compile */
vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
prog_data, nir, 32, 2,
params->base.stats != NULL,
debug_enabled);
max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);
if (!run_fs(*vmulti, false, false)) {
brw_shader_perf_log(compiler, params->base.log_data,
"Dual-SIMD16 shader failed to compile: %s\n",
vmulti->fail_msg);
} else {
vbase = vmulti.get();
multi_cfg = vmulti->cfg;
assert(!vmulti->spilled_any_registers);
}
}
if (!vbase && max_dispatch_width >= 16 &&
2 * prog_data->num_varying_inputs <= MAX_VARYING &&
INTEL_SIMD(FS, 2X8)) {
/* Try a dual-SIMD8 compile */
vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
prog_data, nir, 16, 2,
params->base.stats != NULL,
debug_enabled);
max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);
if (!run_fs(*vmulti, false, false)) {
brw_shader_perf_log(compiler, params->base.log_data,
"Dual-SIMD8 shader failed to compile: %s\n",
vmulti->fail_msg);
} else {
vbase = vmulti.get();
multi_cfg = vmulti->cfg;
}
}
}
if ((!vbase || vbase->dispatch_width < 32) &&
max_dispatch_width >= 32 &&
INTEL_SIMD(FS, 32)) {
/* Try a SIMD32 compile */
v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
prog_data, nir, 32, 1,
params->base.stats != NULL,
debug_enabled);
if (vbase)
v32->import_uniforms(vbase);
if (!run_fs(*v32, false, false)) {
brw_shader_perf_log(compiler, params->base.log_data,
"SIMD32 shader failed to compile: %s\n",
v32->fail_msg);
} else {
if (!vbase)
vbase = v32.get();
simd32_cfg = v32->cfg;
assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
prog_data->base.grf_used = std::max(prog_data->base.grf_used,
v32->grf_used);
}
}
if (!vbase && INTEL_SIMD(FS, 16)) {
/* Try a SIMD16 compile */
v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
prog_data, nir, 16, 1,
params->base.stats != NULL,
debug_enabled);
if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
brw_shader_perf_log(compiler, params->base.log_data,
"SIMD16 shader failed to compile: %s\n",
v16->fail_msg);
} else {
simd16_cfg = v16->cfg;
assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
prog_data->base.grf_used = std::max(prog_data->base.grf_used,
v16->grf_used);
}
}
} else {
if ((!has_spilled && (!v8 || v8->max_dispatch_width >= 16) &&
INTEL_SIMD(FS, 16)) ||