intel/brw/xe3+: Optimize CS/TASK/MESH compile time optimistically assuming SIMD32.

This is similar in principle to the previous commit "intel/brw/xe3+: brw_compile_fs() implementation for Xe3+." but applied to compute-like shader stages. It changes the implementation of brw_compile_cs/task/mesh() to reduce compile time and take advantage of wider dispatch modes more aggressively than the original logic, since as of Xe3 SIMD32 builds succeed without spills in most cases thanks to VRT. The new "optimistic" SIMD selection logic starts with the SIMD width that is potentially highest performance and only compiles additional narrower variants if that fails (typically due to spilling), while the old "pessimistic" logic did the opposite: It started with the narrowest SIMD width and compiled additional variants with increasing register pressure until one of them failed to compile. In typical non-spilling cases where we formerly compiled SIMD16 and SIMD32 variants of the same compute shader, this change will halve the number of backend compilations required to build it. XXX - Possibly don't do this in cases with variable workgroup size until effect on runtime performance can be measured directly. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> v2: Don't do this for now in cases with variable workgroup size, still compile every possible variant in such cases. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32664>
2026-01-01 07:30:09 +01:00 · 2024-09-24 16:53:25 -07:00 · 2024-09-24 16:53:25 -07:00 · 6513bf65c3
commit 6513bf65c3
parent 7e1362e9c0
4 changed files with 51 additions and 24 deletions
--- a/src/intel/compiler/brw_compile_cs.cpp
+++ b/src/intel/compiler/brw_compile_cs.cpp
@ -135,6 +135,7 @@ const unsigned *
 brw_compile_cs(const struct brw_compiler *compiler,
               struct brw_compile_cs_params *params)
 {
+   const struct intel_device_info *devinfo = compiler->devinfo;
   struct nir_shader *nir = params->base.nir;
   const struct brw_cs_prog_key *key = params->key;
   struct brw_cs_prog_data *prog_data = params->prog_data;
@ -166,7 +167,9 @@ brw_compile_cs(const struct brw_compiler *compiler,

   std::unique_ptr<fs_visitor> v[3];

-   for (unsigned simd = 0; simd < 3; simd++) {
+   for (unsigned i = 0; i < 3; i++) {
+      const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
+
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

@ -192,16 +195,25 @@ brw_compile_cs(const struct brw_compiler *compiler,
                                             params->base.stats != NULL,
                                             debug_enabled);

-      const int first = brw_simd_first_compiled(simd_state);
-      if (first >= 0)
-         v[simd]->import_uniforms(v[first].get());
+      const bool allow_spilling = simd == 0 ||
+         (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)) ||
+         nir->info.workgroup_size_variable;

-      const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
+      if (devinfo->ver < 30 || nir->info.workgroup_size_variable) {
+         const int first = brw_simd_first_compiled(simd_state);
+         if (first >= 0)
+            v[simd]->import_uniforms(v[first].get());
+         assert(allow_spilling == (first < 0 || nir->info.workgroup_size_variable));
+      }

      if (run_cs(*v[simd], allow_spilling)) {
         cs_fill_push_const_info(compiler->devinfo, prog_data);

         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
+
+         if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers &&
+             !nir->info.workgroup_size_variable)
+            break;
      } else {
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
         if (simd > 0) {
--- a/src/intel/compiler/brw_compile_mesh.cpp
+++ b/src/intel/compiler/brw_compile_mesh.cpp
@ -398,6 +398,7 @@ const unsigned *
 brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params)
 {
+   const struct intel_device_info *devinfo = compiler->devinfo;
   struct nir_shader *nir = params->base.nir;
   const struct brw_task_prog_key *key = params->key;
   struct brw_task_prog_data *prog_data = params->prog_data;
@ -441,7 +442,9 @@ brw_compile_task(const struct brw_compiler *compiler,

   std::unique_ptr<fs_visitor> v[3];

-   for (unsigned simd = 0; simd < 3; simd++) {
+   for (unsigned i = 0; i < 3; i++) {
+      const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
+
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

@ -467,11 +470,16 @@ brw_compile_task(const struct brw_compiler *compiler,
         v[simd]->import_uniforms(v[first].get());
      }

-      const bool allow_spilling = !brw_simd_any_compiled(simd_state);
-      if (run_task_mesh(*v[simd], allow_spilling))
+      const bool allow_spilling = simd == 0 ||
+         (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
+      if (run_task_mesh(*v[simd], allow_spilling)) {
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
-      else
+
+         if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
+            break;
+      } else {
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
+      }
   }

   int selected_simd = brw_simd_select(simd_state);
@ -1688,6 +1696,7 @@ const unsigned *
 brw_compile_mesh(const struct brw_compiler *compiler,
                 struct brw_compile_mesh_params *params)
 {
+   const struct intel_device_info *devinfo = compiler->devinfo;
   struct nir_shader *nir = params->base.nir;
   const struct brw_mesh_prog_key *key = params->key;
   struct brw_mesh_prog_data *prog_data = params->prog_data;
@ -1742,7 +1751,9 @@ brw_compile_mesh(const struct brw_compiler *compiler,

   std::unique_ptr<fs_visitor> v[3];

-   for (int simd = 0; simd < 3; simd++) {
+   for (unsigned i = 0; i < 3; i++) {
+      const unsigned simd = devinfo->ver >= 30 ? 2 - i : i;
+
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

@ -1780,11 +1791,16 @@ brw_compile_mesh(const struct brw_compiler *compiler,
         v[simd]->import_uniforms(v[first].get());
      }

-      const bool allow_spilling = !brw_simd_any_compiled(simd_state);
-      if (run_task_mesh(*v[simd], allow_spilling))
+      const bool allow_spilling = simd == 0 ||
+         (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1));
+      if (run_task_mesh(*v[simd], allow_spilling)) {
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
-      else
+
+         if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers)
+            break;
+      } else {
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
+      }
   }

   int selected_simd = brw_simd_select(simd_state);
--- a/src/intel/compiler/brw_simd_selection.cpp
+++ b/src/intel/compiler/brw_simd_selection.cpp
@ -87,23 +87,23 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
   const auto prog_data = get_prog_data(state);
   const unsigned width = 8u << simd;

+   if (state.required_width && state.required_width != width) {
+      state.error[simd] = "Different than required dispatch width";
+      return false;
+   }
+
   /* For shaders with variable size workgroup, in most cases we can compile
    * all the variants (exceptions are bindless dispatch & ray queries), since
    * the choice will happen only at dispatch time.
    */
   const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;

-   if (!workgroup_size_variable) {
+   if (!workgroup_size_variable && !state.required_width) {
      if (state.spilled[simd]) {
         state.error[simd] = "Would spill";
         return false;
      }

-      if (state.required_width && state.required_width != width) {
-         state.error[simd] = "Different than required dispatch width";
-         return false;
-      }
-
      if (cs_prog_data) {
         const unsigned workgroup_size = cs_prog_data->local_size[0] *
                                         cs_prog_data->local_size[1] *
@ -112,8 +112,7 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
         unsigned max_threads = state.devinfo->max_cs_workgroup_threads;

         const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
-         if (simd > min_simd && state.compiled[simd - 1] &&
-            workgroup_size <= (width / 2)) {
+         if (simd > min_simd && workgroup_size <= (width / 2)) {
            state.error[simd] = "Workgroup size already fits in smaller SIMD";
            return false;
         }
--- a/src/intel/compiler/test_simd_selection.cpp
+++ b/src/intel/compiler/test_simd_selection.cpp
@ -193,7 +193,7 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8)
   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD16 | 1u << SIMD32);

   const unsigned wg_8_1_1[] = { 8, 1, 1 };
-   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), -1);

   const unsigned wg_16_1_1[] = { 16, 1, 1 };
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
@ -240,10 +240,10 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8NoSIMD16)
   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD32);

   const unsigned wg_8_1_1[] = { 8, 1, 1 };
-   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD32);
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), -1);

   const unsigned wg_16_1_1[] = { 16, 1, 1 };
-   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD32);
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), -1);

   const unsigned wg_32_1_1[] = { 32, 1, 1 };
   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD32);