From 80b2355b394be8bfca62bd127ef5d11088a7fb19 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Wed, 18 Sep 2024 14:24:41 -0700
Subject: [PATCH] intel/brw: Allow specifying a required subgroup size for
 fragment shaders.

On older hardware the "use_rep_send" compile parameter was being
implicitly used to request the compilation of the SIMD16 variant of
clear pixel shaders that require it due to hardware restrictions.

However starting on Gfx12+ this flag is never set since replicated
data clears are no longer supported, but BLORP still implicitly relies
on the SIMD16 variant being generated even though there's no way for
BLORP to explicitly request it.  This doesn't cause much of a problem
right now since brw_compile_fs() typically generates a SIMD16 kernel
unless the SIMD8 kernel spills or SIMD debugging flags are enabled,
but it won't work reliably on Xe3+ since we'll start using SIMD32 more
aggressively.

In order to avoid these issues use the standard required subgroup_size
parameter from shader_info to signal that the SIMD16 variant of the
shader is needed by the caller.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32664>
---
 src/intel/compiler/brw_compile_fs.cpp     | 29 ++++++++++++++++-------
 src/intel/compiler/brw_nir.c              |  2 --
 src/intel/compiler/brw_simd_selection.cpp |  1 -
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp
index bd923b03350..65939b69316 100644
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@@ -1576,6 +1576,14 @@ brw_compile_fs(const struct brw_compiler *compiler,
    brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
                                  params->mue_map);
 
+   /* Either an unrestricted or a fixed SIMD16 subgroup size are
+    * allowed -- The latter is needed for fast clear and replicated
+    * data clear shaders.
+    */
+   const unsigned reqd_dispatch_width = brw_required_dispatch_width(&nir->info);
+   assert(reqd_dispatch_width == SUBGROUP_SIZE_VARYING ||
+          reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16);
+
    std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
       *multi_cfg = NULL;
@@ -1613,9 +1621,9 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                " pixel shading.\n");
    }
 
-   if (!has_spilled &&
-       (!v8 || v8->max_dispatch_width >= 16) &&
-       (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
+   if ((!has_spilled && (!v8 || v8->max_dispatch_width >= 16) &&
+        INTEL_SIMD(FS, 16)) ||
+       reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16) {
       /* Try a SIMD16 compile */
       v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                          prog_data, nir, 16, 1,
@@ -1645,9 +1653,9 @@ brw_compile_fs(const struct brw_compiler *compiler,
    /* Currently, the compiler only supports SIMD32 on SNB+ */
    if (!has_spilled &&
        (!v8 || v8->max_dispatch_width >= 32) &&
-       (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
-       !simd16_failed &&
-       INTEL_SIMD(FS, 32)) {
+       (!v16 || v16->max_dispatch_width >= 32) &&
+       reqd_dispatch_width == SUBGROUP_SIZE_VARYING &&
+       !simd16_failed && INTEL_SIMD(FS, 32)) {
       /* Try a SIMD32 compile */
       v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                          prog_data, nir, 32, 1,
@@ -1680,7 +1688,8 @@ brw_compile_fs(const struct brw_compiler *compiler,
    }
 
    if (devinfo->ver >= 12 && !has_spilled &&
-       params->max_polygons >= 2 && !key->coarse_pixel) {
+       params->max_polygons >= 2 && !key->coarse_pixel &&
+       reqd_dispatch_width == SUBGROUP_SIZE_VARYING) {
       fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
       assert(vbase);
 
@@ -1749,8 +1758,10 @@ brw_compile_fs(const struct brw_compiler *compiler,
       }
    }
 
-   /* When the caller requests a repclear shader, they want SIMD16-only */
-   if (params->use_rep_send)
+   /* When the caller compiles a repclear or fast clear shader, they
+    * want SIMD16-only.
+    */
+   if (reqd_dispatch_width == SUBGROUP_SIZE_REQUIRE_16)
       simd8_cfg = NULL;
 
    brw_generator g(compiler, &params->base, &prog_data->base,
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 77b20ce95cb..987ff15666e 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1999,8 +1999,6 @@ get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
    case SUBGROUP_SIZE_REQUIRE_8:
    case SUBGROUP_SIZE_REQUIRE_16:
    case SUBGROUP_SIZE_REQUIRE_32:
-      assert(gl_shader_stage_uses_workgroup(info->stage) ||
-             (info->stage >= MESA_SHADER_RAYGEN && info->stage <= MESA_SHADER_CALLABLE));
       /* These enum values are expressly chosen to be equal to the subgroup
        * size that they require.
        */
diff --git a/src/intel/compiler/brw_simd_selection.cpp b/src/intel/compiler/brw_simd_selection.cpp
index 86ed1c28e84..907b38c3f12 100644
--- a/src/intel/compiler/brw_simd_selection.cpp
+++ b/src/intel/compiler/brw_simd_selection.cpp
@@ -31,7 +31,6 @@ unsigned
 brw_required_dispatch_width(const struct shader_info *info)
 {
    if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
-      assert(gl_shader_stage_uses_workgroup(info->stage));
       /* These enum values are expressly chosen to be equal to the subgroup
        * size that they require.
        */