From 6513bf65c309e2b16485091bf772f1df1086b50f Mon Sep 17 00:00:00 2001
From: Francisco Jerez
Date: Tue, 24 Sep 2024 16:53:25 -0700
Subject: [PATCH] intel/brw/xe3+: Optimize CS/TASK/MESH compile time
 optimistically assuming SIMD32.

This is similar in principle to the previous commit
"intel/brw/xe3+: brw_compile_fs() implementation for Xe3+." but
applied to compute-like shader stages.  It changes the implementation
of brw_compile_cs/task/mesh() to reduce compile time and take
advantage of wider dispatch modes more aggressively than the original
logic, since as of Xe3 SIMD32 builds succeed without spills in most
cases thanks to VRT.

The new "optimistic" SIMD selection logic starts with the SIMD width
that is potentially highest performance and only compiles additional
narrower variants if that fails (typically due to spilling), while
the old "pessimistic" logic did the opposite: it started with the
narrowest SIMD width and compiled additional variants with increasing
register pressure until one of them failed to compile.

In typical non-spilling cases where we formerly compiled SIMD16 and
SIMD32 variants of the same compute shader, this change will halve
the number of backend compilations required to build it.

XXX - Possibly don't do this in cases with variable workgroup size
until effect on runtime performance can be measured directly.

Reviewed-by: Lionel Landwerlin

v2: Don't do this for now in cases with variable workgroup size,
    still compile every possible variant in such cases.
Part-of: --- src/intel/compiler/brw_compile_cs.cpp | 22 +++++++++++---- src/intel/compiler/brw_compile_mesh.cpp | 32 ++++++++++++++++------ src/intel/compiler/brw_simd_selection.cpp | 15 +++++----- src/intel/compiler/test_simd_selection.cpp | 6 ++-- 4 files changed, 51 insertions(+), 24 deletions(-) diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp index 4b1fe6ae974..dd86044f5b8 100644 --- a/src/intel/compiler/brw_compile_cs.cpp +++ b/src/intel/compiler/brw_compile_cs.cpp @@ -135,6 +135,7 @@ const unsigned * brw_compile_cs(const struct brw_compiler *compiler, struct brw_compile_cs_params *params) { + const struct intel_device_info *devinfo = compiler->devinfo; struct nir_shader *nir = params->base.nir; const struct brw_cs_prog_key *key = params->key; struct brw_cs_prog_data *prog_data = params->prog_data; @@ -166,7 +167,9 @@ brw_compile_cs(const struct brw_compiler *compiler, std::unique_ptr v[3]; - for (unsigned simd = 0; simd < 3; simd++) { + for (unsigned i = 0; i < 3; i++) { + const unsigned simd = devinfo->ver >= 30 ? 
2 - i : i; + if (!brw_simd_should_compile(simd_state, simd)) continue; @@ -192,16 +195,25 @@ brw_compile_cs(const struct brw_compiler *compiler, params->base.stats != NULL, debug_enabled); - const int first = brw_simd_first_compiled(simd_state); - if (first >= 0) - v[simd]->import_uniforms(v[first].get()); + const bool allow_spilling = simd == 0 || + (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)) || + nir->info.workgroup_size_variable; - const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; + if (devinfo->ver < 30 || nir->info.workgroup_size_variable) { + const int first = brw_simd_first_compiled(simd_state); + if (first >= 0) + v[simd]->import_uniforms(v[first].get()); + assert(allow_spilling == (first < 0 || nir->info.workgroup_size_variable)); + } if (run_cs(*v[simd], allow_spilling)) { cs_fill_push_const_info(compiler->devinfo, prog_data); brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + + if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers && + !nir->info.workgroup_size_variable) + break; } else { simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); if (simd > 0) { diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp index 846ccf4cd24..8316e090357 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -398,6 +398,7 @@ const unsigned * brw_compile_task(const struct brw_compiler *compiler, struct brw_compile_task_params *params) { + const struct intel_device_info *devinfo = compiler->devinfo; struct nir_shader *nir = params->base.nir; const struct brw_task_prog_key *key = params->key; struct brw_task_prog_data *prog_data = params->prog_data; @@ -441,7 +442,9 @@ brw_compile_task(const struct brw_compiler *compiler, std::unique_ptr v[3]; - for (unsigned simd = 0; simd < 3; simd++) { + for (unsigned i = 0; i < 3; i++) { + const unsigned simd = devinfo->ver >= 
30 ? 2 - i : i; + if (!brw_simd_should_compile(simd_state, simd)) continue; @@ -467,11 +470,16 @@ brw_compile_task(const struct brw_compiler *compiler, v[simd]->import_uniforms(v[first].get()); } - const bool allow_spilling = !brw_simd_any_compiled(simd_state); - if (run_task_mesh(*v[simd], allow_spilling)) + const bool allow_spilling = simd == 0 || + (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)); + if (run_task_mesh(*v[simd], allow_spilling)) { brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); - else + + if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers) + break; + } else { simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); + } } int selected_simd = brw_simd_select(simd_state); @@ -1688,6 +1696,7 @@ const unsigned * brw_compile_mesh(const struct brw_compiler *compiler, struct brw_compile_mesh_params *params) { + const struct intel_device_info *devinfo = compiler->devinfo; struct nir_shader *nir = params->base.nir; const struct brw_mesh_prog_key *key = params->key; struct brw_mesh_prog_data *prog_data = params->prog_data; @@ -1742,7 +1751,9 @@ brw_compile_mesh(const struct brw_compiler *compiler, std::unique_ptr v[3]; - for (int simd = 0; simd < 3; simd++) { + for (unsigned i = 0; i < 3; i++) { + const unsigned simd = devinfo->ver >= 30 ? 
2 - i : i; + if (!brw_simd_should_compile(simd_state, simd)) continue; @@ -1780,11 +1791,16 @@ brw_compile_mesh(const struct brw_compiler *compiler, v[simd]->import_uniforms(v[first].get()); } - const bool allow_spilling = !brw_simd_any_compiled(simd_state); - if (run_task_mesh(*v[simd], allow_spilling)) + const bool allow_spilling = simd == 0 || + (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)); + if (run_task_mesh(*v[simd], allow_spilling)) { brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); - else + + if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers) + break; + } else { simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); + } } int selected_simd = brw_simd_select(simd_state); diff --git a/src/intel/compiler/brw_simd_selection.cpp b/src/intel/compiler/brw_simd_selection.cpp index 907b38c3f12..a613189a9ed 100644 --- a/src/intel/compiler/brw_simd_selection.cpp +++ b/src/intel/compiler/brw_simd_selection.cpp @@ -87,23 +87,23 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) const auto prog_data = get_prog_data(state); const unsigned width = 8u << simd; + if (state.required_width && state.required_width != width) { + state.error[simd] = "Different than required dispatch width"; + return false; + } + /* For shaders with variable size workgroup, in most cases we can compile * all the variants (exceptions are bindless dispatch & ray queries), since * the choice will happen only at dispatch time. 
*/ const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0; - if (!workgroup_size_variable) { + if (!workgroup_size_variable && !state.required_width) { if (state.spilled[simd]) { state.error[simd] = "Would spill"; return false; } - if (state.required_width && state.required_width != width) { - state.error[simd] = "Different than required dispatch width"; - return false; - } - if (cs_prog_data) { const unsigned workgroup_size = cs_prog_data->local_size[0] * cs_prog_data->local_size[1] * @@ -112,8 +112,7 @@ brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd) unsigned max_threads = state.devinfo->max_cs_workgroup_threads; const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0; - if (simd > min_simd && state.compiled[simd - 1] && - workgroup_size <= (width / 2)) { + if (simd > min_simd && workgroup_size <= (width / 2)) { state.error[simd] = "Workgroup size already fits in smaller SIMD"; return false; } diff --git a/src/intel/compiler/test_simd_selection.cpp b/src/intel/compiler/test_simd_selection.cpp index f3076b817a0..462d922607f 100644 --- a/src/intel/compiler/test_simd_selection.cpp +++ b/src/intel/compiler/test_simd_selection.cpp @@ -193,7 +193,7 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8) ASSERT_EQ(prog_data->prog_mask, 1u << SIMD16 | 1u << SIMD32); const unsigned wg_8_1_1[] = { 8, 1, 1 }; - ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16); + ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), -1); const unsigned wg_16_1_1[] = { 16, 1, 1 }; ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16); @@ -240,10 +240,10 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8NoSIMD16) ASSERT_EQ(prog_data->prog_mask, 1u << SIMD32); const unsigned wg_8_1_1[] = { 8, 1, 1 }; - ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD32); + ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, 
prog_data, wg_8_1_1), -1); const unsigned wg_16_1_1[] = { 16, 1, 1 }; - ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD32); + ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), -1); const unsigned wg_32_1_1[] = { 32, 1, 1 }; ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD32);