From fdb029fe1b40a8d86b3fec5e2800a0bf81e065b0 Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caio.oliveira@intel.com>
Date: Fri, 12 Jul 2024 14:20:57 -0700
Subject: [PATCH] intel/brw: Move and reduce scope of run_*() functions

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30169>
---
 src/intel/compiler/brw_compile_bs.cpp   |  32 +-
 src/intel/compiler/brw_compile_cs.cpp   |  42 ++-
 src/intel/compiler/brw_compile_fs.cpp   | 117 ++++++-
 src/intel/compiler/brw_compile_gs.cpp   |  53 +++-
 src/intel/compiler/brw_compile_mesh.cpp |  36 ++-
 src/intel/compiler/brw_compile_tcs.cpp  |  59 +++-
 src/intel/compiler/brw_compile_tes.cpp  |  32 +-
 src/intel/compiler/brw_compile_vs.cpp   |  32 +-
 src/intel/compiler/brw_fs.cpp           | 394 ------------------------
 src/intel/compiler/brw_fs.h             |   9 -
 10 files changed, 389 insertions(+), 417 deletions(-)
diff --git a/src/intel/compiler/brw_compile_bs.cpp b/src/intel/compiler/brw_compile_bs.cpp
index 83d9f020194..2d6a3c54e4a 100644
--- a/src/intel/compiler/brw_compile_bs.cpp
+++ b/src/intel/compiler/brw_compile_bs.cpp
@@ -28,6 +28,36 @@ brw_bsr(const struct intel_device_info *devinfo,
           SET_BITS(local_arg_offset / 8, 2, 0);
 }
 
+static bool
+run_bs(fs_visitor &s, bool allow_spilling)
+{
+   assert(s.stage >= MESA_SHADER_RAYGEN && s.stage <= MESA_SHADER_CALLABLE);
+
+   s.payload_ = new bs_thread_payload(s);
+
+   nir_to_brw(&s);
+
+   if (s.failed)
+      return false;
+
+   /* TODO(RT): Perhaps rename this? */
+   s.emit_cs_terminate();
+
+   s.calculate_cfg();
+
+   brw_fs_optimize(s);
+
+   s.assign_curb_setup();
+
+   brw_fs_lower_3src_null_dest(s);
+   brw_fs_workaround_memory_fence_before_eot(s);
+   brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+   s.allocate_registers(allow_spilling);
+
+   return !s.failed;
+}
+
 static uint8_t
 compile_single_bs(const struct brw_compiler *compiler,
                   struct brw_compile_bs_params *params,
@@ -78,7 +108,7 @@ compile_single_bs(const struct brw_compiler *compiler,
                                              debug_enabled);
 
       const bool allow_spilling = !brw_simd_any_compiled(simd_state);
-      if (v[simd]->run_bs(allow_spilling)) {
+      if (run_bs(*v[simd], allow_spilling)) {
          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
       } else {
          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx,
diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp
index 90cbfc7afd7..5dd387979d4 100644
--- a/src/intel/compiler/brw_compile_cs.cpp
+++ b/src/intel/compiler/brw_compile_cs.cpp
@@ -4,6 +4,7 @@
  */
 
 #include "brw_fs.h"
+#include "brw_fs_builder.h"
 #include "brw_fs_live_variables.h"
 #include "brw_nir.h"
 #include "brw_cfg.h"
@@ -15,6 +16,8 @@
 
 #include <memory>
 
+using namespace brw;
+
 static void
 fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
 {
@@ -56,6 +59,43 @@ cs_fill_push_const_info(const struct intel_device_info *devinfo,
              prog_data->nr_params);
 }
 
+static bool
+run_cs(fs_visitor &s, bool allow_spilling)
+{
+   assert(gl_shader_stage_is_compute(s.stage));
+   const fs_builder bld = fs_builder(&s).at_end();
+
+   s.payload_ = new cs_thread_payload(s);
+
+   if (s.devinfo->platform == INTEL_PLATFORM_HSW && s.prog_data->total_shared > 0) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(brw_sr0_reg(1), BRW_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UW), 1));
+   }
+
+   nir_to_brw(&s);
+
+   if (s.failed)
+      return false;
+
+   s.emit_cs_terminate();
+
+   s.calculate_cfg();
+
+   brw_fs_optimize(s);
+
+   s.assign_curb_setup();
+
+   brw_fs_lower_3src_null_dest(s);
+   brw_fs_workaround_memory_fence_before_eot(s);
+   brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+   s.allocate_registers(allow_spilling);
+
+   return !s.failed;
+}
+
 const unsigned *
 brw_compile_cs(const struct brw_compiler *compiler,
                struct brw_compile_cs_params *params)
@@ -119,7 +159,7 @@ brw_compile_cs(const struct brw_compiler *compiler,
 
       const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
 
-      if (v[simd]->run_cs(allow_spilling)) {
+      if (run_cs(*v[simd], allow_spilling)) {
          cs_fill_push_const_info(compiler->devinfo, prog_data);
 
          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp
index 59565502221..234050a977b 100644
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@@ -5,6 +5,7 @@
 
 #include "brw_eu.h"
 #include "brw_fs.h"
+#include "brw_fs_builder.h"
 #include "brw_fs_live_variables.h"
 #include "brw_nir.h"
 #include "brw_cfg.h"
@@ -590,6 +591,110 @@ brw_nir_populate_wm_prog_data(nir_shader *shader,
    brw_compute_flat_inputs(prog_data, shader);
 }
 
+/* From the SKL PRM, Volume 16, Workarounds:
+ *
+ *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
+ *              only header phases (R0-R2)
+ *
+ *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
+ *       have been header only.
+ *
+ * Instead of enabling push constants one can alternatively enable one of the
+ * inputs. Here one simply chooses "layer" which shouldn't impose much
+ * overhead.
+ */
+static void
+gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
+{
+   if (wm_prog_data->num_varying_inputs)
+      return;
+
+   if (wm_prog_data->base.curb_read_length)
+      return;
+
+   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
+   wm_prog_data->num_varying_inputs = 1;
+
+   brw_compute_urb_setup_index(wm_prog_data);
+}
+
+static bool
+run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
+{
+   const struct intel_device_info *devinfo = s.devinfo;
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
+   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) s.key;
+   const fs_builder bld = fs_builder(&s).at_end();
+   const nir_shader *nir = s.nir;
+
+   assert(s.stage == MESA_SHADER_FRAGMENT);
+
+   s.payload_ = new fs_thread_payload(s, s.source_depth_to_render_target);
+
+   if (nir->info.ray_queries > 0)
+      s.limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");
+
+   if (do_rep_send) {
+      assert(s.dispatch_width == 16);
+      s.emit_repclear_shader();
+   } else {
+      if (nir->info.inputs_read > 0 ||
+          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
+          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
+         s.emit_interpolation_setup();
+      }
+
+      /* We handle discards by keeping track of the still-live pixels in f0.1.
+       * Initialize it with the dispatched pixels.
+       */
+      if (devinfo->ver >= 20 || wm_prog_data->uses_kill) {
+         const unsigned lower_width = MIN2(s.dispatch_width, 16);
+         for (unsigned i = 0; i < s.dispatch_width / lower_width; i++) {
+            /* According to the "PS Thread Payload for Normal
+             * Dispatch" pages on the BSpec, the dispatch mask is
+             * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
+             * gfx6+.
+             */
+            const brw_reg dispatch_mask =
+               devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
+                                    brw_vec1_grf(i + 1, 7);
+            bld.exec_all().group(1, 0)
+               .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
+                    retype(dispatch_mask, BRW_TYPE_UW));
+         }
+      }
+
+      if (nir->info.writes_memory)
+         wm_prog_data->has_side_effects = true;
+
+      nir_to_brw(&s);
+
+      if (s.failed)
+	 return false;
+
+      s.emit_fb_writes();
+
+      s.calculate_cfg();
+
+      brw_fs_optimize(s);
+
+      s.assign_curb_setup();
+
+      if (devinfo->ver == 9)
+         gfx9_ps_header_only_workaround(wm_prog_data);
+
+      s.assign_urb_setup();
+
+      brw_fs_lower_3src_null_dest(s);
+      brw_fs_workaround_memory_fence_before_eot(s);
+      brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+      s.allocate_registers(allow_spilling);
+   }
+
+   return !s.failed;
+}
+
 const unsigned *
 brw_compile_fs(const struct brw_compiler *compiler,
                struct brw_compile_fs_params *params)
@@ -644,7 +749,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                         prog_data, nir, 8, 1,
                                         params->base.stats != NULL,
                                         debug_enabled);
-      if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
+      if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) {
          params->base.error_str = ralloc_strdup(params->base.mem_ctx,
                                                 v8->fail_msg);
          return NULL;
@@ -680,7 +785,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                          debug_enabled);
       if (v8)
          v16->import_uniforms(v8.get());
-      if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
+      if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
          brw_shader_perf_log(compiler, params->base.log_data,
                              "SIMD16 shader failed to compile: %s\n",
                              v16->fail_msg);
@@ -715,7 +820,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
       else if (v16)
          v32->import_uniforms(v16.get());
 
-      if (!v32->run_fs(allow_spilling, false)) {
+      if (!run_fs(*v32, allow_spilling, false)) {
          brw_shader_perf_log(compiler, params->base.log_data,
                              "SIMD32 shader failed to compile: %s\n",
                              v32->fail_msg);
@@ -752,7 +857,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                                params->base.stats != NULL,
                                                debug_enabled);
          vmulti->import_uniforms(vbase);
-         if (!vmulti->run_fs(false, params->use_rep_send)) {
+         if (!run_fs(*vmulti, false, params->use_rep_send)) {
             brw_shader_perf_log(compiler, params->base.log_data,
                                 "Quad-SIMD8 shader failed to compile: %s\n",
                                 vmulti->fail_msg);
@@ -772,7 +877,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                                params->base.stats != NULL,
                                                debug_enabled);
          vmulti->import_uniforms(vbase);
-         if (!vmulti->run_fs(false, params->use_rep_send)) {
+         if (!run_fs(*vmulti, false, params->use_rep_send)) {
             brw_shader_perf_log(compiler, params->base.log_data,
                                 "Dual-SIMD16 shader failed to compile: %s\n",
                                 vmulti->fail_msg);
@@ -791,7 +896,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
                                                params->base.stats != NULL,
                                                debug_enabled);
          vmulti->import_uniforms(vbase);
-         if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) {
+         if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) {
             brw_shader_perf_log(compiler, params->base.log_data,
                                 "Dual-SIMD8 shader failed to compile: %s\n",
                                 vmulti->fail_msg);
diff --git a/src/intel/compiler/brw_compile_gs.cpp b/src/intel/compiler/brw_compile_gs.cpp
index 69055e58503..d8b645d21d0 100644
--- a/src/intel/compiler/brw_compile_gs.cpp
+++ b/src/intel/compiler/brw_compile_gs.cpp
@@ -5,11 +5,14 @@
 
 #include "brw_eu.h"
 #include "brw_fs.h"
+#include "brw_fs_builder.h"
 #include "brw_prim.h"
 #include "brw_nir.h"
 #include "brw_private.h"
 #include "dev/intel_debug.h"
 
+using namespace brw;
+
 static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
    [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
    [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
@@ -27,6 +30,54 @@ static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
    [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
 };
 
+static bool
+run_gs(fs_visitor &s)
+{
+   assert(s.stage == MESA_SHADER_GEOMETRY);
+
+   s.payload_ = new gs_thread_payload(s);
+
+   const fs_builder bld = fs_builder(&s).at_end();
+
+   s.final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD);
+
+   if (s.gs_compile->control_data_header_size_bits > 0) {
+      /* Create a VGRF to store accumulated control data bits. */
+      s.control_data_bits = bld.vgrf(BRW_TYPE_UD);
+
+      /* If we're outputting more than 32 control data bits, then EmitVertex()
+       * will set control_data_bits to 0 after emitting the first vertex.
+       * Otherwise, we need to initialize it to 0 here.
+       */
+      if (s.gs_compile->control_data_header_size_bits <= 32) {
+         const fs_builder abld = bld.annotate("initialize control data bits");
+         abld.MOV(s.control_data_bits, brw_imm_ud(0u));
+      }
+   }
+
+   nir_to_brw(&s);
+
+   s.emit_gs_thread_end();
+
+   if (s.failed)
+      return false;
+
+   s.calculate_cfg();
+
+   brw_fs_optimize(s);
+
+   s.assign_curb_setup();
+   s.assign_gs_urb_setup();
+
+   brw_fs_lower_3src_null_dest(s);
+   brw_fs_workaround_memory_fence_before_eot(s);
+   brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+   s.allocate_registers(true /* allow_spilling */);
+
+   return !s.failed;
+}
+
 extern "C" const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler,
                struct brw_compile_gs_params *params)
@@ -244,7 +295,7 @@ brw_compile_gs(const struct brw_compiler *compiler,
 
    fs_visitor v(compiler, &params->base, &c, prog_data, nir,
                 params->base.stats != NULL, debug_enabled);
-   if (v.run_gs()) {
+   if (run_gs(v)) {
       prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
 
       assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp
index 3095613ada4..0a272ea8d33 100644
--- a/src/intel/compiler/brw_compile_mesh.cpp
+++ b/src/intel/compiler/brw_compile_mesh.cpp
@@ -259,6 +259,38 @@ brw_nir_align_launch_mesh_workgroups(nir_shader *nir)
                                        NULL);
 }
 
+static bool
+run_task_mesh(fs_visitor &s, bool allow_spilling)
+{
+   assert(s.stage == MESA_SHADER_TASK ||
+          s.stage == MESA_SHADER_MESH);
+
+   s.payload_ = new task_mesh_thread_payload(s);
+
+   nir_to_brw(&s);
+
+   if (s.failed)
+      return false;
+
+   s.emit_urb_fence();
+
+   s.emit_cs_terminate();
+
+   s.calculate_cfg();
+
+   brw_fs_optimize(s);
+
+   s.assign_curb_setup();
+
+   brw_fs_lower_3src_null_dest(s);
+   brw_fs_workaround_memory_fence_before_eot(s);
+   brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+   s.allocate_registers(allow_spilling);
+
+   return !s.failed;
+}
+
 const unsigned *
 brw_compile_task(const struct brw_compiler *compiler,
                  struct brw_compile_task_params *params)
@@ -331,7 +363,7 @@ brw_compile_task(const struct brw_compiler *compiler,
       }
 
       const bool allow_spilling = !brw_simd_any_compiled(simd_state);
-      if (v[simd]->run_task(allow_spilling))
+      if (run_task_mesh(*v[simd], allow_spilling))
          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
       else
          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
@@ -1621,7 +1653,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
       }
 
       const bool allow_spilling = !brw_simd_any_compiled(simd_state);
-      if (v[simd]->run_mesh(allow_spilling))
+      if (run_task_mesh(*v[simd], allow_spilling))
          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
       else
          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
diff --git a/src/intel/compiler/brw_compile_tcs.cpp b/src/intel/compiler/brw_compile_tcs.cpp
index cfee80652a9..2502778f8b5 100644
--- a/src/intel/compiler/brw_compile_tcs.cpp
+++ b/src/intel/compiler/brw_compile_tcs.cpp
@@ -7,9 +7,12 @@
 #include "intel_nir.h"
 #include "brw_nir.h"
 #include "brw_fs.h"
+#include "brw_fs_builder.h"
 #include "brw_private.h"
 #include "dev/intel_debug.h"
 
+using namespace brw;
+
 /**
  * Return the number of patches to accumulate before a MULTI_PATCH mode thread is
  * launched.  In cases with a large number of input control points and a large
@@ -39,6 +42,60 @@ get_patch_count_threshold(int input_control_points)
    return 1;
 }
 
+static bool
+run_tcs(fs_visitor &s)
+{
+   assert(s.stage == MESA_SHADER_TESS_CTRL);
+
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data);
+   const fs_builder bld = fs_builder(&s).at_end();
+
+   assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
+          vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
+
+   s.payload_ = new tcs_thread_payload(s);
+
+   /* Initialize gl_InvocationID */
+   s.set_tcs_invocation_id();
+
+   const bool fix_dispatch_mask =
+      vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
+      (s.nir->info.tess.tcs_vertices_out % 8) != 0;
+
+   /* Fix the disptach mask */
+   if (fix_dispatch_mask) {
+      bld.CMP(bld.null_reg_ud(), s.invocation_id,
+              brw_imm_ud(s.nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
+      bld.IF(BRW_PREDICATE_NORMAL);
+   }
+
+   nir_to_brw(&s);
+
+   if (fix_dispatch_mask) {
+      bld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   s.emit_tcs_thread_end();
+
+   if (s.failed)
+      return false;
+
+   s.calculate_cfg();
+
+   brw_fs_optimize(s);
+
+   s.assign_curb_setup();
+   s.assign_tcs_urb_setup();
+
+   brw_fs_lower_3src_null_dest(s);
+   brw_fs_workaround_memory_fence_before_eot(s);
+   brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+   s.allocate_registers(true /* allow_spilling */);
+
+   return !s.failed;
+}
+
 extern "C" const unsigned *
 brw_compile_tcs(const struct brw_compiler *compiler,
                 struct brw_compile_tcs_params *params)
@@ -136,7 +193,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    fs_visitor v(compiler, &params->base, &key->base,
                 &prog_data->base.base, nir, dispatch_width,
                 params->base.stats != NULL, debug_enabled);
-   if (!v.run_tcs()) {
+   if (!run_tcs(v)) {
       params->base.error_str =
          ralloc_strdup(params->base.mem_ctx, v.fail_msg);
       return NULL;
diff --git a/src/intel/compiler/brw_compile_tes.cpp b/src/intel/compiler/brw_compile_tes.cpp
index c8baca58cb1..f22c51c9f52 100644
--- a/src/intel/compiler/brw_compile_tes.cpp
+++ b/src/intel/compiler/brw_compile_tes.cpp
@@ -11,6 +11,36 @@
 #include "dev/intel_debug.h"
 #include "util/macros.h"
 
+static bool
+run_tes(fs_visitor &s)
+{
+   assert(s.stage == MESA_SHADER_TESS_EVAL);
+
+   s.payload_ = new tes_thread_payload(s);
+
+   nir_to_brw(&s);
+
+   if (s.failed)
+      return false;
+
+   s.emit_urb_writes();
+
+   s.calculate_cfg();
+
+   brw_fs_optimize(s);
+
+   s.assign_curb_setup();
+   s.assign_tes_urb_setup();
+
+   brw_fs_lower_3src_null_dest(s);
+   brw_fs_workaround_memory_fence_before_eot(s);
+   brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+   s.allocate_registers(true /* allow_spilling */);
+
+   return !s.failed;
+}
+
 const unsigned *
 brw_compile_tes(const struct brw_compiler *compiler,
                 brw_compile_tes_params *params)
@@ -109,7 +139,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
    fs_visitor v(compiler, &params->base, &key->base,
                 &prog_data->base.base, nir, dispatch_width,
                 params->base.stats != NULL, debug_enabled);
-   if (!v.run_tes()) {
+   if (!run_tes(v)) {
       params->base.error_str =
          ralloc_strdup(params->base.mem_ctx, v.fail_msg);
       return NULL;
diff --git a/src/intel/compiler/brw_compile_vs.cpp b/src/intel/compiler/brw_compile_vs.cpp
index 5f103c65e4a..825e9a9af37 100644
--- a/src/intel/compiler/brw_compile_vs.cpp
+++ b/src/intel/compiler/brw_compile_vs.cpp
@@ -11,6 +11,36 @@
 
 using namespace brw;
 
+static bool
+run_vs(fs_visitor &s)
+{
+   assert(s.stage == MESA_SHADER_VERTEX);
+
+   s.payload_ = new vs_thread_payload(s);
+
+   nir_to_brw(&s);
+
+   if (s.failed)
+      return false;
+
+   s.emit_urb_writes();
+
+   s.calculate_cfg();
+
+   brw_fs_optimize(s);
+
+   s.assign_curb_setup();
+   s.assign_vs_urb_setup();
+
+   brw_fs_lower_3src_null_dest(s);
+   brw_fs_workaround_memory_fence_before_eot(s);
+   brw_fs_workaround_emit_dummy_mov_instruction(s);
+
+   s.allocate_registers(true /* allow_spilling */);
+
+   return !s.failed;
+}
+
 extern "C" const unsigned *
 brw_compile_vs(const struct brw_compiler *compiler,
                struct brw_compile_vs_params *params)
@@ -102,7 +132,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
    fs_visitor v(compiler, &params->base, &key->base,
                 &prog_data->base.base, nir, dispatch_width,
                 params->base.stats != NULL, debug_enabled);
-   if (!v.run_vs()) {
+   if (!run_vs(v)) {
       params->base.error_str =
          ralloc_strdup(params->base.mem_ctx, v.fail_msg);
       return NULL;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 36d5679244b..dab28efaefa 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2709,36 +2709,6 @@ fs_visitor::allocate_registers(bool allow_spilling)
    brw_fs_lower_scoreboard(*this);
 }
 
-bool
-fs_visitor::run_vs()
-{
-   assert(stage == MESA_SHADER_VERTEX);
-
-   payload_ = new vs_thread_payload(*this);
-
-   nir_to_brw(this);
-
-   if (failed)
-      return false;
-
-   emit_urb_writes();
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-   assign_vs_urb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(true /* allow_spilling */);
-
-   return !failed;
-}
-
 void
 fs_visitor::set_tcs_invocation_id()
 {
@@ -2811,370 +2781,6 @@ fs_visitor::emit_tcs_thread_end()
    inst->eot = true;
 }
 
-bool
-fs_visitor::run_tcs()
-{
-   assert(stage == MESA_SHADER_TESS_CTRL);
-
-   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
-   const fs_builder bld = fs_builder(this).at_end();
-
-   assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
-          vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
-
-   payload_ = new tcs_thread_payload(*this);
-
-   /* Initialize gl_InvocationID */
-   set_tcs_invocation_id();
-
-   const bool fix_dispatch_mask =
-      vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
-      (nir->info.tess.tcs_vertices_out % 8) != 0;
-
-   /* Fix the disptach mask */
-   if (fix_dispatch_mask) {
-      bld.CMP(bld.null_reg_ud(), invocation_id,
-              brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
-      bld.IF(BRW_PREDICATE_NORMAL);
-   }
-
-   nir_to_brw(this);
-
-   if (fix_dispatch_mask) {
-      bld.emit(BRW_OPCODE_ENDIF);
-   }
-
-   emit_tcs_thread_end();
-
-   if (failed)
-      return false;
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-   assign_tcs_urb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(true /* allow_spilling */);
-
-   return !failed;
-}
-
-bool
-fs_visitor::run_tes()
-{
-   assert(stage == MESA_SHADER_TESS_EVAL);
-
-   payload_ = new tes_thread_payload(*this);
-
-   nir_to_brw(this);
-
-   if (failed)
-      return false;
-
-   emit_urb_writes();
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-   assign_tes_urb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(true /* allow_spilling */);
-
-   return !failed;
-}
-
-bool
-fs_visitor::run_gs()
-{
-   assert(stage == MESA_SHADER_GEOMETRY);
-
-   payload_ = new gs_thread_payload(*this);
-
-   const fs_builder bld = fs_builder(this).at_end();
-
-   this->final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD);
-
-   if (gs_compile->control_data_header_size_bits > 0) {
-      /* Create a VGRF to store accumulated control data bits. */
-      this->control_data_bits = bld.vgrf(BRW_TYPE_UD);
-
-      /* If we're outputting more than 32 control data bits, then EmitVertex()
-       * will set control_data_bits to 0 after emitting the first vertex.
-       * Otherwise, we need to initialize it to 0 here.
-       */
-      if (gs_compile->control_data_header_size_bits <= 32) {
-         const fs_builder abld = bld.annotate("initialize control data bits");
-         abld.MOV(this->control_data_bits, brw_imm_ud(0u));
-      }
-   }
-
-   nir_to_brw(this);
-
-   emit_gs_thread_end();
-
-   if (failed)
-      return false;
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-   assign_gs_urb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(true /* allow_spilling */);
-
-   return !failed;
-}
-
-/* From the SKL PRM, Volume 16, Workarounds:
- *
- *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
- *              only header phases (R0-R2)
- *
- *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
- *       have been header only.
- *
- * Instead of enabling push constants one can alternatively enable one of the
- * inputs. Here one simply chooses "layer" which shouldn't impose much
- * overhead.
- */
-static void
-gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
-{
-   if (wm_prog_data->num_varying_inputs)
-      return;
-
-   if (wm_prog_data->base.curb_read_length)
-      return;
-
-   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
-   wm_prog_data->num_varying_inputs = 1;
-
-   brw_compute_urb_setup_index(wm_prog_data);
-}
-
-bool
-fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
-{
-   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
-   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
-   const fs_builder bld = fs_builder(this).at_end();
-
-   assert(stage == MESA_SHADER_FRAGMENT);
-
-   payload_ = new fs_thread_payload(*this, source_depth_to_render_target);
-
-   if (nir->info.ray_queries > 0)
-      limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");
-
-   if (do_rep_send) {
-      assert(dispatch_width == 16);
-      emit_repclear_shader();
-   } else {
-      if (nir->info.inputs_read > 0 ||
-          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
-          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
-         emit_interpolation_setup();
-      }
-
-      /* We handle discards by keeping track of the still-live pixels in f0.1.
-       * Initialize it with the dispatched pixels.
-       */
-      if (devinfo->ver >= 20 || wm_prog_data->uses_kill) {
-         const unsigned lower_width = MIN2(dispatch_width, 16);
-         for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
-            /* According to the "PS Thread Payload for Normal
-             * Dispatch" pages on the BSpec, the dispatch mask is
-             * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
-             * gfx6+.
-             */
-            const brw_reg dispatch_mask =
-               devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
-                                    brw_vec1_grf(i + 1, 7);
-            bld.exec_all().group(1, 0)
-               .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
-                    retype(dispatch_mask, BRW_TYPE_UW));
-         }
-      }
-
-      if (nir->info.writes_memory)
-         wm_prog_data->has_side_effects = true;
-
-      nir_to_brw(this);
-
-      if (failed)
-	 return false;
-
-      emit_fb_writes();
-
-      calculate_cfg();
-
-      brw_fs_optimize(*this);
-
-      assign_curb_setup();
-
-      if (devinfo->ver == 9)
-         gfx9_ps_header_only_workaround(wm_prog_data);
-
-      assign_urb_setup();
-
-      brw_fs_lower_3src_null_dest(*this);
-      brw_fs_workaround_memory_fence_before_eot(*this);
-      brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-      allocate_registers(allow_spilling);
-   }
-
-   return !failed;
-}
-
-bool
-fs_visitor::run_cs(bool allow_spilling)
-{
-   assert(gl_shader_stage_is_compute(stage));
-   const fs_builder bld = fs_builder(this).at_end();
-
-   payload_ = new cs_thread_payload(*this);
-
-   if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
-      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
-      const fs_builder abld = bld.exec_all().group(1, 0);
-      abld.MOV(retype(brw_sr0_reg(1), BRW_TYPE_UW),
-               suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UW), 1));
-   }
-
-   nir_to_brw(this);
-
-   if (failed)
-      return false;
-
-   emit_cs_terminate();
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(allow_spilling);
-
-   return !failed;
-}
-
-bool
-fs_visitor::run_bs(bool allow_spilling)
-{
-   assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
-
-   payload_ = new bs_thread_payload(*this);
-
-   nir_to_brw(this);
-
-   if (failed)
-      return false;
-
-   /* TODO(RT): Perhaps rename this? */
-   emit_cs_terminate();
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(allow_spilling);
-
-   return !failed;
-}
-
-bool
-fs_visitor::run_task(bool allow_spilling)
-{
-   assert(stage == MESA_SHADER_TASK);
-
-   payload_ = new task_mesh_thread_payload(*this);
-
-   nir_to_brw(this);
-
-   if (failed)
-      return false;
-
-   emit_urb_fence();
-
-   emit_cs_terminate();
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(allow_spilling);
-
-   return !failed;
-}
-
-bool
-fs_visitor::run_mesh(bool allow_spilling)
-{
-   assert(stage == MESA_SHADER_MESH);
-
-   payload_ = new task_mesh_thread_payload(*this);
-
-   nir_to_brw(this);
-
-   if (failed)
-      return false;
-
-   emit_urb_fence();
-
-   emit_cs_terminate();
-
-   calculate_cfg();
-
-   brw_fs_optimize(*this);
-
-   assign_curb_setup();
-
-   brw_fs_lower_3src_null_dest(*this);
-   brw_fs_workaround_memory_fence_before_eot(*this);
-   brw_fs_workaround_emit_dummy_mov_instruction(*this);
-
-   allocate_registers(allow_spilling);
-
-   return !failed;
-}
-
-
 /**
  * Move load_interpolated_input with simple (payload-based) barycentric modes
  * to the top of the program so we don't emit multiple PLNs for the same input.
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 2a1cbaa095e..40f48864034 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -301,15 +301,6 @@ public:
                                    uint8_t alignment,
                                    unsigned components);
 
-   bool run_fs(bool allow_spilling, bool do_rep_send);
-   bool run_vs();
-   bool run_tcs();
-   bool run_tes();
-   bool run_gs();
-   bool run_cs(bool allow_spilling);
-   bool run_bs(bool allow_spilling);
-   bool run_task(bool allow_spilling);
-   bool run_mesh(bool allow_spilling);
    void allocate_registers(bool allow_spilling);
    uint32_t compute_max_register_pressure();
    void assign_curb_setup();