intel/brw: Allow specifying a required subgroup size for fragment shaders.

On older hardware the "use_rep_send" compile parameter was being implicitly used to request the compilation of the SIMD16 variant of clear pixel shaders that require it due to hardware restrictions. However starting on Gfx12+ this flag is never set since replicated data clears are no longer supported, but BLORP still implicitly relies on the SIMD16 variant being generated even though there's no way for BLORP to explicitly request it. This doesn't cause much of a problem right now since brw_compile_fs() typically generates a SIMD16 kernel unless the SIMD8 kernel spills or SIMD debugging flags are enabled, but it won't work reliably on Xe3+ since we'll start using SIMD32 more aggressively. In order to avoid these issues use the standard required subgroup_size parameter from shader_info to signal that the SIMD16 variant of the shader is needed by the caller. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32664>
2025-12-26 12:50:10 +01:00 · 2024-09-18 14:24:41 -07:00 · 2024-09-18 14:24:41 -07:00 · 80b2355b39
commit 80b2355b39
parent a736757275
3 changed files with 20 additions and 12 deletions
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@ -1576,6 +1576,14 @@ brw_compile_fs(const struct brw_compiler *compiler,
   brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
                                 params->mue_map);

+   /* Either an unrestricted or a fixed SIMD16 subgroup size are
+    * allowed -- The latter is needed for fast clear and replicated
+    * data clear shaders.
+    */
+   const unsigned reqd_dispatch_width = brw_required_dispatch_width(&nir->info);
+   assert(reqd_dispatch_width == SUBGROUP_SIZE_VARYING ||
+          reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16);
+
   std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
      *multi_cfg = NULL;
@ -1613,9 +1621,9 @@ brw_compile_fs(const struct brw_compiler *compiler,
                               " pixel shading.\n");
   }

-   if (!has_spilled &&
-       (!v8 || v8->max_dispatch_width >= 16) &&
-       (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
+   if ((!has_spilled && (!v8 || v8->max_dispatch_width >= 16) &&
+        INTEL_SIMD(FS, 16)) ||
+       reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16) {
      /* Try a SIMD16 compile */
      v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                         prog_data, nir, 16, 1,
@ -1645,9 +1653,9 @@ brw_compile_fs(const struct brw_compiler *compiler,
   /* Currently, the compiler only supports SIMD32 on SNB+ */
   if (!has_spilled &&
       (!v8 || v8->max_dispatch_width >= 32) &&
-       (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
-       !simd16_failed &&
-       INTEL_SIMD(FS, 32)) {
+       (!v16 || v16->max_dispatch_width >= 32) &&
+       reqd_dispatch_width == SUBGROUP_SIZE_VARYING &&
+       !simd16_failed && INTEL_SIMD(FS, 32)) {
      /* Try a SIMD32 compile */
      v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                         prog_data, nir, 32, 1,
@ -1680,7 +1688,8 @@ brw_compile_fs(const struct brw_compiler *compiler,
   }

   if (devinfo->ver >= 12 && !has_spilled &&
-       params->max_polygons >= 2 && !key->coarse_pixel) {
+       params->max_polygons >= 2 && !key->coarse_pixel &&
+       reqd_dispatch_width == SUBGROUP_SIZE_VARYING) {
      fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
      assert(vbase);

@ -1749,8 +1758,10 @@ brw_compile_fs(const struct brw_compiler *compiler,
      }
   }

-   /* When the caller requests a repclear shader, they want SIMD16-only */
-   if (params->use_rep_send)
+   /* When the caller compiles a repclear or fast clear shader, they
+    * want SIMD16-only.
+    */
+   if (reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16)
      simd8_cfg = NULL;

   brw_generator g(compiler, &params->base, &prog_data->base,
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@ -1999,8 +1999,6 @@ get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
   case SUBGROUP_SIZE_REQUIRE_8:
   case SUBGROUP_SIZE_REQUIRE_16:
   case SUBGROUP_SIZE_REQUIRE_32:
-      assert(gl_shader_stage_uses_workgroup(info->stage) ||
-             (info->stage >= MESA_SHADER_RAYGEN && info->stage <= MESA_SHADER_CALLABLE));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
--- a/src/intel/compiler/brw_simd_selection.cpp
+++ b/src/intel/compiler/brw_simd_selection.cpp
@ -31,7 +31,6 @@ unsigned
 brw_required_dispatch_width(const struct shader_info *info)
 {
   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
-      assert(gl_shader_stage_uses_workgroup(info->stage));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */