intel/brw/xe3+: brw_compile_fs() implementation for Xe3+.

This reworks the implementation of brw_compile_fs() to reduce compile time and take advantage of wider dispatch modes more aggressively than the original logic. The new "optimistic" PS compilation logic starts with the SIMD width that is potentially highest performance and only compiles additional narrower variants if that fails (typically due to spilling or hardware restrictions), while the old "pessimistic" logic did the opposite: It started with the narrowest SIMD width and compiled additional variants with increasing register pressure until one of them failed to compile. The main disadvantage of this is that selectively throwing away some of the compiled variants based on the static analysis of their performance behavior will no longer be possible, however this is expected to be less useful on Xe3+ since the GRF space allocated to a thread can be scaled up or down, which leads to less dramatic differences in scheduling between SIMD variants. In typical non-spilling cases where we formerly compiled SIMD16 and SIMD32 variants of the same fragment shader, this change will halve the number of backend compilations required to build a shader. With multi-polygon PS dispatch enabled (which is disabled by default right now) this has an even more dramatic effect since the number of compiler iterations can be reduced down to a fifth in the best case scenario. Even though in most cases we will only attempt to return a single binary from the pixel shader compilation, the hardware allows a pair of PS kernels to be specified, and we'll still take advantage of this when the multi-polygon PS kernel has the potential to have worse performance than the single-polygon shader because only the latter register-allocates successfully at SIMD32 -- Only in such case (SIMD2x8 multi-polygon, SIMD32 single-polygon) we'll continue programming both so the hardware will chose one or the other at runtime depending on the SIMD fullness and number of polygons it can buffer at runtime. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32664>
2025-12-25 06:30:10 +01:00 · 2024-09-18 14:38:19 -07:00 · 2024-09-18 14:38:19 -07:00 · 5b6906076e
commit 5b6906076e
parent 1b2bd1fcb8
1 changed files with 115 additions and 0 deletions
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@ -1624,6 +1624,121 @@ brw_compile_fs(const struct brw_compiler *compiler,
   }

   if (devinfo->ver >= 30) {
+      unsigned max_dispatch_width = reqd_dispatch_width ? reqd_dispatch_width : 32;
+      fs_visitor *vbase = NULL;
+
+      if (params->max_polygons >= 2 && !key->coarse_pixel) {
+         if (params->max_polygons >= 4 && max_dispatch_width >= 32 &&
+             4 * prog_data->num_varying_inputs <= MAX_VARYING &&
+             INTEL_SIMD(FS, 4X8)) {
+            /* Try a quad-SIMD8 compile */
+            vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
+                                                  prog_data, nir, 32, 4,
+                                                  params->base.stats != NULL,
+                                                  debug_enabled);
+            max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);
+
+            if (!run_fs(*vmulti, false, false)) {
+               brw_shader_perf_log(compiler, params->base.log_data,
+                                   "Quad-SIMD8 shader failed to compile: %s\n",
+                                   vmulti->fail_msg);
+            } else {
+               vbase = vmulti.get();
+               multi_cfg = vmulti->cfg;
+               assert(!vmulti->spilled_any_registers);
+            }
+         }
+
+         if (!vbase && max_dispatch_width >= 32 &&
+             2 * prog_data->num_varying_inputs <= MAX_VARYING &&
+             INTEL_SIMD(FS, 2X16)) {
+            /* Try a dual-SIMD16 compile */
+            vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
+                                                  prog_data, nir, 32, 2,
+                                                  params->base.stats != NULL,
+                                                  debug_enabled);
+            max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);
+
+            if (!run_fs(*vmulti, false, false)) {
+               brw_shader_perf_log(compiler, params->base.log_data,
+                                   "Dual-SIMD16 shader failed to compile: %s\n",
+                                   vmulti->fail_msg);
+            } else {
+               vbase = vmulti.get();
+               multi_cfg = vmulti->cfg;
+               assert(!vmulti->spilled_any_registers);
+            }
+         }
+
+         if (!vbase && max_dispatch_width >= 16 &&
+             2 * prog_data->num_varying_inputs <= MAX_VARYING &&
+             INTEL_SIMD(FS, 2X8)) {
+            /* Try a dual-SIMD8 compile */
+            vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
+                                                  prog_data, nir, 16, 2,
+                                                  params->base.stats != NULL,
+                                                  debug_enabled);
+            max_dispatch_width = std::min(max_dispatch_width, vmulti->dispatch_width);
+
+            if (!run_fs(*vmulti, false, false)) {
+               brw_shader_perf_log(compiler, params->base.log_data,
+                                   "Dual-SIMD8 shader failed to compile: %s\n",
+                                   vmulti->fail_msg);
+            } else {
+               vbase = vmulti.get();
+               multi_cfg = vmulti->cfg;
+            }
+         }
+      }
+
+      if ((!vbase || vbase->dispatch_width < 32) &&
+          max_dispatch_width >= 32 &&
+          INTEL_SIMD(FS, 32)) {
+         /* Try a SIMD32 compile */
+         v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
+                                            prog_data, nir, 32, 1,
+                                            params->base.stats != NULL,
+                                            debug_enabled);
+         if (vbase)
+            v32->import_uniforms(vbase);
+
+         if (!run_fs(*v32, false, false)) {
+            brw_shader_perf_log(compiler, params->base.log_data,
+                                "SIMD32 shader failed to compile: %s\n",
+                                v32->fail_msg);
+         } else {
+            if (!vbase)
+               vbase = v32.get();
+
+            simd32_cfg = v32->cfg;
+            assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
+            prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
+            prog_data->base.grf_used = std::max(prog_data->base.grf_used,
+                                                v32->grf_used);
+         }
+      }
+
+      if (!vbase && INTEL_SIMD(FS, 16)) {
+         /* Try a SIMD16 compile */
+         v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
+                                            prog_data, nir, 16, 1,
+                                            params->base.stats != NULL,
+                                            debug_enabled);
+
+         if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
+            brw_shader_perf_log(compiler, params->base.log_data,
+                                "SIMD16 shader failed to compile: %s\n",
+                                v16->fail_msg);
+         } else {
+            simd16_cfg = v16->cfg;
+
+            assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
+            prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
+            prog_data->base.grf_used = std::max(prog_data->base.grf_used,
+                                                v16->grf_used);
+         }
+      }
+
   } else {
      if ((!has_spilled && (!v8 || v8->max_dispatch_width >= 16) &&
           INTEL_SIMD(FS, 16)) ||