brw: Limit SIMD width based on NIR rather than first backend compile

I originally added this mechanism to have the first (SIMD8) compile note that certain features were in use which would prevent SIMD16/32 from compiling, so we could skip the work of trying those. But these days, there aren't many cases, and the ones we have are easily detectable based on the NIR. We can detect it earlier without even having to do the SIMD8 compile. Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41122>
2026-05-08 02:38:04 +02:00 · 2026-03-09 11:58:13 -07:00 · 2026-03-09 11:58:13 -07:00 · 2729b1608f
commit 2729b1608f
parent c5928d40ae
3 changed files with 59 additions and 86 deletions
--- a/src/intel/compiler/brw/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw/brw_compile_fs.cpp
@ -121,24 +121,9 @@ brw_do_emit_fb_writes(brw_shader &s, int nr_color_regions, bool replicate_alpha)
 static void
 brw_emit_fb_writes(brw_shader &s)
 {
-   const struct intel_device_info *devinfo = s.devinfo;
   assert(s.stage == MESA_SHADER_FRAGMENT);
-   struct brw_fs_prog_data *prog_data = brw_fs_prog_data(s.prog_data);
   brw_fs_prog_key *key = (brw_fs_prog_key*) s.key;

-   if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
-      /* From the 'Render Target Write message' section of the docs:
-       * "Output Stencil is not supported with SIMD16 Render Target Write
-       * Messages."
-       */
-      if (devinfo->ver >= 20)
-         s.limit_dispatch_width(16, "gl_FragStencilRefARB unsupported "
-                                "in SIMD32+ mode.\n");
-      else
-         s.limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
-                                "in SIMD16+ mode.\n");
-   }
-
   /* ANV doesn't know about sample mask output during the wm key creation
    * so we compute if we need replicate alpha and emit alpha to coverage
    * workaround here.
@ -147,29 +132,6 @@ brw_emit_fb_writes(brw_shader &s)
      (key->nr_color_regions > 1 && key->alpha_to_coverage &&
       s.sample_mask.file == BAD_FILE);

-   /* Following condition implements Wa_14017468336:
-    *
-    * "If dual source blend is enabled do not enable SIMD32 dispatch" and
-    * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last
-    *  Render Target Select set."
-    */
-   if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
-       prog_data->dual_src_blend) {
-      /* The dual-source RT write messages fail to release the thread
-       * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
-       *
-       * XXX - Emit an extra single-source NULL RT-write marked LastRT in
-       *       order to release the thread dependency without disabling
-       *       SIMD32.
-       *
-       * The dual-source RT write messages may lead to hangs with SIMD16
-       * dispatch on ICL due some unknown reasons, see
-       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
-       */
-      s.limit_dispatch_width(8, "Dual source blending unsupported "
-                             "in SIMD16 and SIMD32 modes.\n");
-   }
-
   brw_do_emit_fb_writes(s, key->nr_color_regions, replicate_alpha);
 }

@ -1299,9 +1261,6 @@ run_fs(brw_shader &s, bool allow_spilling, bool do_rep_send)

   s.payload_ = new brw_fs_thread_payload(s);

-   if (nir->info.ray_queries > 0)
-      s.limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");
-
   if (do_rep_send) {
      assert(s.dispatch_width == 16);
      brw_emit_repclear_shader(s);
@ -1425,6 +1384,63 @@ brw_nir_cleanup_pre_fs_prog_data(brw_pass_tracker *pt)
   } while (pt->progress);
 }

+static unsigned
+limit_fs_dispatch_width(const struct intel_device_info *devinfo,
+                        const nir_shader *nir,
+                        const struct brw_fs_prog_key *key)
+{
+   unsigned limit = 32;
+
+   /* We don't support SIMD32 FS with ray queries.  We could, but the message
+    * is limited to SIMD16, and they're complex enough that SIMD32 isn't
+    * likely to be useful anyway.
+    */
+   if (nir->info.ray_queries > 0)
+      limit = MIN2(limit, 16);
+
+   /* The 'Render Target Write message' section of the docs says:
+    *
+    *    "Output Stencil is not supported with SIMD16 Render Target
+    *     Write Messages."
+    */
+   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
+      limit = MIN2(limit, devinfo->ver >= 20 ? 16 : 8);
+
+   /* Following condition implements Wa_14017468336:
+    *
+    * "If dual source blend is enabled do not enable SIMD32 dispatch" and
+    * "For a thread dispatched as SIMD32, must not issue SIMD8 message with
+    *  Last Render Target Select set."
+    *
+    * The dual-source RT write messages fail to release the thread
+    * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
+    *
+    * XXX - Emit an extra single-source NULL RT-write marked LastRT in
+    *       order to release the thread dependency without disabling SIMD32.
+    *
+    * The dual-source RT write messages may lead to hangs with SIMD16
+    * dispatch on ICL due some unknown reasons, see:
+    *
+    * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
+    */
+   if (nir->info.fs.color_is_dual_source &&
+       devinfo->ver >= 11 && devinfo->ver <= 12)
+      limit = MIN2(limit, 8);
+
+   if (devinfo->ver < 20 && key->coarse_pixel) {
+      /* SIMD32 is not supported for coarse pixel shading */
+      limit = MIN2(limit, 16);
+
+      /* SIMD16 coarse pixel shading cannot use the SIMD8 messages required
+       * for dual source blending.
+       */
+      if (nir->info.fs.color_is_dual_source)
+         limit = MIN2(limit, 8);
+   }
+
+   return limit;
+}
+
 const unsigned *
 brw_compile_fs(const struct brw_compiler *compiler,
               struct brw_compile_fs_params *params)
@ -1555,10 +1571,8 @@ brw_compile_fs(const struct brw_compiler *compiler,
   const unsigned reqd_dispatch_width = brw_required_dispatch_width(&nir->info);
   assert(reqd_dispatch_width == 0 || reqd_dispatch_width == 16);

-   /* Limit identified when first variant is compiled, see
-    * brw_shader::limit_dispatch_width().
-    */
-   unsigned dispatch_width_limit = UINT_MAX;
+   const unsigned dispatch_width_limit =
+      limit_fs_dispatch_width(devinfo, nir, key);

   std::unique_ptr<brw_shader> v8, v16, v32, vmulti;
   float throughput = 0;
@ -1589,17 +1603,6 @@ brw_compile_fs(const struct brw_compiler *compiler,
         return NULL;
      }

-      if (key->coarse_pixel) {
-         if (prog_data->dual_src_blend) {
-            v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
-                                     " use SIMD8 messages.\n");
-         }
-         v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
-                                  " pixel shading.\n");
-      }
-
-      dispatch_width_limit = MIN2(dispatch_width_limit, v8->max_dispatch_width);
-
      if (INTEL_SIMD(FS, 8)) {
         assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
         prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
@ -1742,8 +1745,6 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                v16->fail_msg);
            v16.reset();
         } else {
-            dispatch_width_limit = MIN2(dispatch_width_limit, v16->max_dispatch_width);
-
            assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
            prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
            prog_data->base.grf_used = MAX2(prog_data->base.grf_used,
--- a/src/intel/compiler/brw/brw_shader.cpp
+++ b/src/intel/compiler/brw/brw_shader.cpp
@ -166,8 +166,6 @@ brw_shader::brw_shader(const brw_shader_params *params)
         api_subgroup_size == 16 ||
         api_subgroup_size == 32);

-   this->max_dispatch_width = 32;
-
   this->failed = false;
   this->fail_msg = NULL;

@ -254,30 +252,6 @@ brw_shader::fail(const char *format, ...)
   va_end(va);
 }

-/**
- * Mark this program as impossible to compile with dispatch width greater
- * than n.
- *
- * During the SIMD8 compile (which happens first), we can detect and flag
- * things that are unsupported in SIMD16+ mode, so the compiler can skip the
- * SIMD16+ compile altogether.
- *
- * During a compile of dispatch width greater than n (if one happens anyway),
- * this just calls fail().
- */
-void
-brw_shader::limit_dispatch_width(unsigned n, const char *msg)
-{
-   if (dispatch_width > n) {
-      fail("%s", msg);
-   } else {
-      max_dispatch_width = MIN2(max_dispatch_width, n);
-      brw_shader_perf_log(compiler, log_data,
-                          "Shader dispatch width limited to SIMD%d: %s\n",
-                          n, msg);
-   }
-}
-
 enum intel_barycentric_mode
 brw_barycentric_mode(const struct brw_fs_prog_key *key,
                     nir_intrinsic_instr *intr)
--- a/src/intel/compiler/brw/brw_shader.h
+++ b/src/intel/compiler/brw/brw_shader.h
@ -86,7 +86,6 @@ public:

   void vfail(const char *msg, va_list args);
   void fail(const char *msg, ...);
-   void limit_dispatch_width(unsigned n, const char *msg);

   void emit_gs_control_data_bits(const brw_reg &vertex_count);
   brw_reg gs_urb_channel_mask(const brw_reg &dword_index);
@ -213,7 +212,6 @@ public:

   const unsigned dispatch_width; /**< 8, 16 or 32 */
   const unsigned max_polygons;
-   unsigned max_dispatch_width;

   /* The API selected subgroup size */
   unsigned api_subgroup_size; /**< 0, 8, 16, 32 */