From 7b651ac6c3e9e5c5971ab162312acedc451a810c Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caio.oliveira@intel.com>
Date: Sun, 11 Feb 2024 00:45:45 -0800
Subject: [PATCH] intel/elk: Remove Gfx9+ from compile/run functions

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
---
 src/intel/compiler/elk/elk_fs.cpp             | 215 ++----------------
 src/intel/compiler/elk/elk_fs.h               |   1 -
 src/intel/compiler/elk/elk_shader.cpp         |   2 +-
 src/intel/compiler/elk/elk_simd_selection.cpp |  10 +-
 src/intel/compiler/elk/elk_vec4.cpp           |   2 +-
 src/intel/compiler/elk/elk_vec4_tcs.cpp       |   2 +-
 6 files changed, 25 insertions(+), 207 deletions(-)

diff --git a/src/intel/compiler/elk/elk_fs.cpp b/src/intel/compiler/elk/elk_fs.cpp
index 33d26fc7267..6847cbc3235 100644
--- a/src/intel/compiler/elk/elk_fs.cpp
+++ b/src/intel/compiler/elk/elk_fs.cpp
@@ -5935,35 +5935,6 @@ elk_fs_visitor::fixup_3src_null_dest()
                           DEPENDENCY_VARIABLES);
 }
 
-/* Wa_14015360517
- *
- * The first instruction of any kernel should have non-zero emask.
- * Make sure this happens by introducing a dummy mov instruction.
- */
-void
-elk_fs_visitor::emit_dummy_mov_instruction()
-{
-   if (!intel_needs_workaround(devinfo, 14015360517))
-      return;
-
-   struct elk_backend_instruction *first_inst =
-      cfg->first_block()->start();
-
-   /* We can skip the WA if first instruction is marked with
-    * force_writemask_all or exec_size equals dispatch_width.
-    */
-   if (first_inst->force_writemask_all ||
-       first_inst->exec_size == dispatch_width)
-      return;
-
-   /* Insert dummy mov as first instruction. */
-   const fs_builder ubld =
-      fs_builder(this, cfg->first_block(), (elk_fs_inst *)first_inst).exec_all().group(8, 0);
-   ubld.MOV(ubld.null_reg_ud(), elk_imm_ud(0u));
-
-   invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
-}
-
 /**
  * Find the first instruction in the program that might start a region of
  * divergent control flow due to a HALT jump.  There is no
@@ -6350,9 +6321,6 @@ elk_fs_visitor::run_vs()
 
    fixup_3src_null_dest();
 
-   /* Wa_14015360517 */
-   emit_dummy_mov_instruction();
-
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
@@ -6479,9 +6447,6 @@ elk_fs_visitor::run_tcs()
 
    fixup_3src_null_dest();
 
-   /* Wa_14015360517 */
-   emit_dummy_mov_instruction();
-
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
@@ -6510,9 +6475,6 @@ elk_fs_visitor::run_tes()
 
    fixup_3src_null_dest();
 
-   /* Wa_14015360517 */
-   emit_dummy_mov_instruction();
-
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
@@ -6558,41 +6520,11 @@ elk_fs_visitor::run_gs()
 
    fixup_3src_null_dest();
 
-   /* Wa_14015360517 */
-   emit_dummy_mov_instruction();
-
    allocate_registers(true /* allow_spilling */);
 
    return !failed;
 }
 
-/* From the SKL PRM, Volume 16, Workarounds:
- *
- *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
- *              only header phases (R0-R2)
- *
- *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
- *       have been header only.
- *
- * Instead of enabling push constants one can alternatively enable one of the
- * inputs. Here one simply chooses "layer" which shouldn't impose much
- * overhead.
- */
-static void
-gfx9_ps_header_only_workaround(struct elk_wm_prog_data *wm_prog_data)
-{
-   if (wm_prog_data->num_varying_inputs)
-      return;
-
-   if (wm_prog_data->base.curb_read_length)
-      return;
-
-   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
-   wm_prog_data->num_varying_inputs = 1;
-
-   elk_compute_urb_setup_index(wm_prog_data);
-}
-
 bool
 elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 {
@@ -6626,11 +6558,9 @@ elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
          for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
             /* According to the "PS Thread Payload for Normal
              * Dispatch" pages on the BSpec, the dispatch mask is
-             * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
-             * gfx6+.
+             * stored in R1.7/R2.7 on gfx6+.
              */
             const elk_fs_reg dispatch_mask =
-               devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
                devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) :
                elk_vec1_grf(0, 0);
             bld.exec_all().group(1, 0)
@@ -6658,16 +6588,10 @@ elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 
       assign_curb_setup();
 
-      if (devinfo->ver == 9)
-         gfx9_ps_header_only_workaround(wm_prog_data);
-
       assign_urb_setup();
 
       fixup_3src_null_dest();
 
-      /* Wa_14015360517 */
-      emit_dummy_mov_instruction();
-
       allocate_registers(allow_spilling);
    }
 
@@ -6705,9 +6629,6 @@ elk_fs_visitor::run_cs(bool allow_spilling)
 
    fixup_3src_null_dest();
 
-   /* Wa_14015360517 */
-   emit_dummy_mov_instruction();
-
    allocate_registers(allow_spilling);
 
    return !failed;
@@ -7120,32 +7041,29 @@ elk_compile_fs(const struct elk_compiler *compiler,
    elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
 
    std::unique_ptr<elk_fs_visitor> v8, v16, v32, vmulti;
-   elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
-      *multi_cfg = NULL;
+   elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
    float throughput = 0;
    bool has_spilled = false;
 
-   if (devinfo->ver < 20) {
-      v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
-                                        prog_data, nir, 8, 1,
-                                        params->base.stats != NULL,
-                                        debug_enabled);
-      if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
-         params->base.error_str = ralloc_strdup(params->base.mem_ctx,
-                                                v8->fail_msg);
-         return NULL;
-      } else if (INTEL_SIMD(FS, 8)) {
-         simd8_cfg = v8->cfg;
+   v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
+                                     prog_data, nir, 8, 1,
+                                     params->base.stats != NULL,
+                                     debug_enabled);
+   if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
+      params->base.error_str = ralloc_strdup(params->base.mem_ctx,
+                                             v8->fail_msg);
+      return NULL;
+   } else if (INTEL_SIMD(FS, 8)) {
+      simd8_cfg = v8->cfg;
 
-         assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
-         prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
+      assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
+      prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
 
-         prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
-         const performance &perf = v8->performance_analysis.require();
-         throughput = MAX2(throughput, perf.throughput);
-         has_spilled = v8->spilled_any_registers;
-         allow_spilling = false;
-      }
+      prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
+      const performance &perf = v8->performance_analysis.require();
+      throughput = MAX2(throughput, perf.throughput);
+      has_spilled = v8->spilled_any_registers;
+      allow_spilling = false;
    }
 
    /* Limit dispatch width to simd8 with dual source blending on gfx8.
@@ -7158,18 +7076,6 @@ elk_compile_fs(const struct elk_compiler *compiler,
                                "using SIMD8 when dual src blending.\n");
    }
 
-   if (key->coarse_pixel && devinfo->ver < 20) {
-      if (prog_data->dual_src_blend) {
-         v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
-                                  " use SIMD8 messages.\n");
-      }
-      v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
-                               " pixel shading.\n");
-   }
-
-   if (nir->info.ray_queries > 0 && v8)
-      v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n");
-
    if (!has_spilled &&
        (!v8 || v8->max_dispatch_width >= 16) &&
        (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
@@ -7238,78 +7144,6 @@ elk_compile_fs(const struct elk_compiler *compiler,
       }
    }
 
-   if (devinfo->ver >= 12 && !has_spilled &&
-       params->max_polygons >= 2 && !key->coarse_pixel) {
-      elk_fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
-      assert(vbase);
-
-      if (devinfo->ver >= 20 &&
-          params->max_polygons >= 4 &&
-          vbase->max_dispatch_width >= 32 &&
-          4 * prog_data->num_varying_inputs <= MAX_VARYING &&
-          INTEL_SIMD(FS, 4X8)) {
-         /* Try a quad-SIMD8 compile */
-         vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
-                                               prog_data, nir, 32, 4,
-                                               params->base.stats != NULL,
-                                               debug_enabled);
-         vmulti->import_uniforms(vbase);
-         if (!vmulti->run_fs(false, params->use_rep_send)) {
-            elk_shader_perf_log(compiler, params->base.log_data,
-                                "Quad-SIMD8 shader failed to compile: %s\n",
-                                vmulti->fail_msg);
-         } else {
-            multi_cfg = vmulti->cfg;
-            assert(!vmulti->spilled_any_registers);
-         }
-      }
-
-      if (!multi_cfg && devinfo->ver >= 20 &&
-          vbase->max_dispatch_width >= 32 &&
-          2 * prog_data->num_varying_inputs <= MAX_VARYING &&
-          INTEL_SIMD(FS, 2X16)) {
-         /* Try a dual-SIMD16 compile */
-         vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
-                                               prog_data, nir, 32, 2,
-                                               params->base.stats != NULL,
-                                               debug_enabled);
-         vmulti->import_uniforms(vbase);
-         if (!vmulti->run_fs(false, params->use_rep_send)) {
-            elk_shader_perf_log(compiler, params->base.log_data,
-                                "Dual-SIMD16 shader failed to compile: %s\n",
-                                vmulti->fail_msg);
-         } else {
-            multi_cfg = vmulti->cfg;
-            assert(!vmulti->spilled_any_registers);
-         }
-      }
-
-      if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
-          2 * prog_data->num_varying_inputs <= MAX_VARYING &&
-          INTEL_SIMD(FS, 2X8)) {
-         /* Try a dual-SIMD8 compile */
-         vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
-                                               prog_data, nir, 16, 2,
-                                               params->base.stats != NULL,
-                                               debug_enabled);
-         vmulti->import_uniforms(vbase);
-         if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) {
-            elk_shader_perf_log(compiler, params->base.log_data,
-                                "Dual-SIMD8 shader failed to compile: %s\n",
-                                vmulti->fail_msg);
-         } else {
-            multi_cfg = vmulti->cfg;
-         }
-      }
-
-      if (multi_cfg) {
-         assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
-         prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
-
-         prog_data->reg_blocks_8 = elk_register_blocks(vmulti->grf_used);
-      }
-   }
-
    /* When the caller requests a repclear shader, they want SIMD16-only */
    if (params->use_rep_send)
       simd8_cfg = NULL;
@@ -7358,16 +7192,7 @@ elk_compile_fs(const struct elk_compiler *compiler,
    struct elk_compile_stats *stats = params->base.stats;
    uint32_t max_dispatch_width = 0;
 
-   if (multi_cfg) {
-      prog_data->dispatch_multi = vmulti->dispatch_width;
-      prog_data->max_polygons = vmulti->max_polygons;
-      g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
-                      vmulti->performance_analysis.require(),
-                      stats, vmulti->max_polygons);
-      stats = stats ? stats + 1 : NULL;
-      max_dispatch_width = vmulti->dispatch_width;
-
-   } else if (simd8_cfg) {
+   if (simd8_cfg) {
       prog_data->dispatch_8 = true;
       g.generate_code(simd8_cfg, 8, v8->shader_stats,
                       v8->performance_analysis.require(), stats, 1);
diff --git a/src/intel/compiler/elk/elk_fs.h b/src/intel/compiler/elk/elk_fs.h
index e97cd7e0de2..7c5642cc52d 100644
--- a/src/intel/compiler/elk/elk_fs.h
+++ b/src/intel/compiler/elk/elk_fs.h
@@ -213,7 +213,6 @@ public:
    void allocate_registers(bool allow_spilling);
    uint32_t compute_max_register_pressure();
    void fixup_3src_null_dest();
-   void emit_dummy_mov_instruction();
    bool fixup_nomask_control_flow();
    void assign_curb_setup();
    void assign_urb_setup();
diff --git a/src/intel/compiler/elk/elk_shader.cpp b/src/intel/compiler/elk/elk_shader.cpp
index 276b7c0284f..d0636efb3a9 100644
--- a/src/intel/compiler/elk/elk_shader.cpp
+++ b/src/intel/compiler/elk/elk_shader.cpp
@@ -1343,7 +1343,7 @@ elk_compile_tes(const struct elk_compiler *compiler,
    }
 
    if (is_scalar) {
-      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
+      const unsigned dispatch_width = 8;
       elk_fs_visitor v(compiler, &params->base, &key->base,
                    &prog_data->base.base, nir, dispatch_width,
                    params->base.stats != NULL, debug_enabled);
diff --git a/src/intel/compiler/elk/elk_simd_selection.cpp b/src/intel/compiler/elk/elk_simd_selection.cpp
index 08815e4a27c..04b87c522c2 100644
--- a/src/intel/compiler/elk/elk_simd_selection.cpp
+++ b/src/intel/compiler/elk/elk_simd_selection.cpp
@@ -102,8 +102,7 @@ elk_simd_should_compile(elk_simd_selection_state &state, unsigned simd)
 
          unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
 
-         const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
-         if (simd > min_simd && state.compiled[simd - 1] &&
+         if (simd > 0 && state.compiled[simd - 1] &&
             workgroup_size <= (width / 2)) {
             state.error[simd] = "Workgroup size already fits in smaller SIMD";
             return false;
@@ -119,7 +118,7 @@ elk_simd_should_compile(elk_simd_selection_state &state, unsigned simd)
        *
        * TODO: Use performance_analysis and drop this rule.
        */
-      if (width == 32 && state.devinfo->ver < 20) {
+      if (width == 32) {
          if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
             state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
             return false;
@@ -127,11 +126,6 @@ elk_simd_should_compile(elk_simd_selection_state &state, unsigned simd)
       }
    }
 
-   if (width == 8 && state.devinfo->ver >= 20) {
-      state.error[simd] = "SIMD8 not supported on Xe2+";
-      return false;
-   }
-
    uint64_t start;
    switch (prog_data->stage) {
    case MESA_SHADER_COMPUTE:
diff --git a/src/intel/compiler/elk/elk_vec4.cpp b/src/intel/compiler/elk/elk_vec4.cpp
index a51722e07e7..d6a24d23ccb 100644
--- a/src/intel/compiler/elk/elk_vec4.cpp
+++ b/src/intel/compiler/elk/elk_vec4.cpp
@@ -2648,7 +2648,7 @@ elk_compile_vs(const struct elk_compiler *compiler,
    }
 
    if (is_scalar) {
-      const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
+      const unsigned dispatch_width = 8;
       prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
 
       elk_fs_visitor v(compiler, &params->base, &key->base,
diff --git a/src/intel/compiler/elk/elk_vec4_tcs.cpp b/src/intel/compiler/elk/elk_vec4_tcs.cpp
index be28602c0c6..f1efa660f71 100644
--- a/src/intel/compiler/elk/elk_vec4_tcs.cpp
+++ b/src/intel/compiler/elk/elk_vec4_tcs.cpp
@@ -447,7 +447,7 @@ elk_compile_tcs(const struct elk_compiler *compiler,
    }
 
    if (is_scalar) {
-      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
+      const unsigned dispatch_width = 8;
       elk_fs_visitor v(compiler, &params->base, &key->base,
                    &prog_data->base.base, nir, dispatch_width,
                    params->base.stats != NULL, debug_enabled);