intel/compiler: Use SIMD selection helpers for variable workgroup size

Variable workgroup size works by compiling as much SIMD variants as possible and then selecting the right one during dispatch (when the actual workgroup size is passed to us). Instead of replicating the logic in a separate function, reuse the same logic for regular SIMD selection. And move function for that together with the remaining simd selection functions. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13249>
2026-04-15 00:00:36 +02:00 · 2021-10-11 07:49:40 -07:00 · 2021-10-11 07:49:40 -07:00 · 4e7b71e00c
commit 4e7b71e00c
parent 7dda0cf2b8
4 changed files with 138 additions and 36 deletions
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -10193,40 +10193,6 @@ brw_compile_cs(const struct brw_compiler *compiler,
   return ret;
 }

-static unsigned
-brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
-                                const struct brw_cs_prog_data *cs_prog_data,
-                                unsigned group_size)
-{
-   const unsigned mask = cs_prog_data->prog_mask;
-   assert(mask != 0);
-
-   static const unsigned simd8  = 1 << 0;
-   static const unsigned simd16 = 1 << 1;
-   static const unsigned simd32 = 1 << 2;
-
-   if (INTEL_DEBUG(DEBUG_DO32) && (mask & simd32))
-      return 32;
-
-   const uint32_t max_threads = devinfo->max_cs_workgroup_threads;
-
-   if ((mask & simd8) && group_size <= 8 * max_threads) {
-      /* Prefer SIMD16 if can do without spilling.  Matches logic in
-       * brw_simd_selection.cpp.
-       */
-      if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))
-         return 16;
-      return 8;
-   }
-
-   if ((mask & simd16) && group_size <= 16 * max_threads)
-      return 16;
-
-   assert(mask & simd32);
-   assert(group_size <= 32 * max_threads);
-   return 32;
-}
-
 struct brw_cs_dispatch_info
 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
@ -10238,9 +10204,13 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
      override_local_size ? override_local_size :
                            prog_data->local_size;

+   const int simd =
+      override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) :
+                            brw_simd_select(prog_data);
+   assert(simd >= 0 && simd < 3);
+
   info.group_size = sizes[0] * sizes[1] * sizes[2];
-   info.simd_size =
-      brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
+   info.simd_size = 8u << simd;
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

   const uint32_t remainder = info.group_size & (info.simd_size - 1);
--- a/src/intel/compiler/brw_private.h
+++ b/src/intel/compiler/brw_private.h
@ -47,6 +47,10 @@ void brw_simd_mark_compiled(unsigned simd,

 int brw_simd_select(const struct brw_cs_prog_data *prog_data);

+int brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
+                                       const struct brw_cs_prog_data *prog_data,
+                                       const unsigned *sizes);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
--- a/src/intel/compiler/brw_simd_selection.c
+++ b/src/intel/compiler/brw_simd_selection.c
@ -161,3 +161,42 @@ brw_simd_select(const struct brw_cs_prog_data *prog_data)
   else
      return -1;
 }
+
+int
+brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
+                                   const struct brw_cs_prog_data *prog_data,
+                                   const unsigned *sizes)
+{
+   assert(sizes);
+
+   if (prog_data->local_size[0] == sizes[0] &&
+       prog_data->local_size[1] == sizes[1] &&
+       prog_data->local_size[2] == sizes[2])
+      return brw_simd_select(prog_data);
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   struct brw_cs_prog_data cloned = *prog_data;
+   for (unsigned i = 0; i < 3; i++)
+      cloned.local_size[i] = sizes[i];
+
+   cloned.prog_mask = 0;
+   cloned.prog_spilled = 0;
+
+   const char *error[3] = {0};
+
+   for (unsigned simd = 0; simd < 3; simd++) {
+      /* We are not recompiling, so use original results of prog_mask and
+       * prog_spilled as they will already contain all possible compilations.
+       */
+      if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
+                                  0 /* required_dispatch_width */, &error[simd]) &&
+          test_bit(prog_data->prog_mask, simd)) {
+         brw_simd_mark_compiled(simd, &cloned, test_bit(prog_data->prog_spilled, simd));
+      }
+   }
+
+   ralloc_free(mem_ctx);
+
+   return brw_simd_select(&cloned);
+}
--- a/src/intel/compiler/test_simd_selection.cpp
+++ b/src/intel/compiler/test_simd_selection.cpp
@ -145,6 +145,15 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariable)
   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);

   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD16);
 }

 TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
@ -161,6 +170,86 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
   brw_simd_mark_compiled(SIMD32, prog_data, spilled);

   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD8);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD8);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD16)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD8);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD8);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8NoSIMD16)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD32);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD32);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD32);
 }

 TEST_F(SIMDSelectionCS, SpillAtSIMD8)