From 7c23b90537e359853d8ecd11c275e13ff23b0469 Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caio.oliveira@intel.com>
Date: Wed, 14 Feb 2024 22:41:17 -0800
Subject: [PATCH] intel/brw: Always use scalar shaders

Remove the scalar_stage[] array from struct brw_compiler, since every
shader stage is now always scalar.  This removes any remaining usage
of vec4 shaders in brw.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
---
 src/intel/compiler/brw_compile_gs.cpp         | 149 +++-----------
 src/intel/compiler/brw_compile_tcs.cpp        |  81 +++-----
 src/intel/compiler/brw_compile_vs.cpp         |  90 +++------
 src/intel/compiler/brw_compiler.c             | 122 ++++--------
 src/intel/compiler/brw_compiler.h             |   1 -
 src/intel/compiler/brw_nir.c                  | 181 +++++++-----------
 src/intel/compiler/brw_nir.h                  |   3 +-
 src/intel/compiler/brw_nir_rt.c               |   3 +-
 src/intel/compiler/brw_shader.cpp             |  81 +++-----
 src/intel/compiler/brw_shader.h               |  14 +-
 src/intel/vulkan/anv_device.c                 |   8 +-
 .../vulkan/anv_nir_compute_push_layout.c      |  14 +-
 12 files changed, 224 insertions(+), 523 deletions(-)

diff --git a/src/intel/compiler/brw_compile_gs.cpp b/src/intel/compiler/brw_compile_gs.cpp
index a63b34de831..19728e2d5c2 100644
--- a/src/intel/compiler/brw_compile_gs.cpp
+++ b/src/intel/compiler/brw_compile_gs.cpp
@@ -3,8 +3,6 @@
  * SPDX-License-Identifier: MIT
  */
 
-#include "brw_vec4_gs_visitor.h"
-#include "gfx6_gs_visitor.h"
 #include "brw_eu.h"
 #include "brw_fs.h"
 #include "brw_prim.h"
@@ -41,7 +39,6 @@ brw_compile_gs(const struct brw_compiler *compiler,
    memset(&c, 0, sizeof(c));
    c.key = *key;
 
-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
 
    prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
@@ -266,135 +263,33 @@ brw_compile_gs(const struct brw_compiler *compiler,
       brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
    }
 
-   if (is_scalar) {
-      fs_visitor v(compiler, &params->base, &c, prog_data, nir,
-                   params->base.stats != NULL, debug_enabled);
-      if (v.run_gs()) {
-         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+   fs_visitor v(compiler, &params->base, &c, prog_data, nir,
+                params->base.stats != NULL, debug_enabled);
+   if (v.run_gs()) {
+      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
 
-         assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
-         prog_data->base.base.dispatch_grf_start_reg =
-            v.payload().num_regs / reg_unit(compiler->devinfo);
+      assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
+      prog_data->base.base.dispatch_grf_start_reg =
+         v.payload().num_regs / reg_unit(compiler->devinfo);
 
-         fs_generator g(compiler, &params->base,
-                        &prog_data->base.base, false, MESA_SHADER_GEOMETRY);
-         if (unlikely(debug_enabled)) {
-            const char *label =
-               nir->info.label ? nir->info.label : "unnamed";
-            char *name = ralloc_asprintf(params->base.mem_ctx,
-                                         "%s geometry shader %s",
-                                         label, nir->info.name);
-            g.enable_debug(name);
-         }
-         g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
-                         v.performance_analysis.require(), params->base.stats);
-         g.add_const_data(nir->constant_data, nir->constant_data_size);
-         return g.get_assembly();
+      fs_generator g(compiler, &params->base,
+                     &prog_data->base.base, false, MESA_SHADER_GEOMETRY);
+      if (unlikely(debug_enabled)) {
+         const char *label =
+            nir->info.label ? nir->info.label : "unnamed";
+         char *name = ralloc_asprintf(params->base.mem_ctx,
+                                      "%s geometry shader %s",
+                                      label, nir->info.name);
+         g.enable_debug(name);
       }
-
-      params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-
-      return NULL;
+      g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
+                      v.performance_analysis.require(), params->base.stats);
+      g.add_const_data(nir->constant_data, nir->constant_data_size);
+      return g.get_assembly();
    }
 
-   if (compiler->devinfo->ver >= 7) {
-      /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
-       * so without spilling. If the GS invocations count > 1, then we can't use
-       * dual object mode.
-       */
-      if (prog_data->invocations <= 1 &&
-          !INTEL_DEBUG(DEBUG_NO_DUAL_OBJECT_GS)) {
-         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
+   params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
 
-         brw::vec4_gs_visitor v(compiler, &params->base, &c, prog_data, nir,
-                                true /* no_spills */,
-                                debug_enabled);
-
-         /* Backup 'nr_params' and 'param' as they can be modified by the
-          * the DUAL_OBJECT visitor. If it fails, we will run the fallback
-          * (DUAL_INSTANCED or SINGLE mode) and we need to restore original
-          * values.
-          */
-         const unsigned param_count = prog_data->base.base.nr_params;
-         uint32_t *param = ralloc_array(NULL, uint32_t, param_count);
-         memcpy(param, prog_data->base.base.param,
-                sizeof(uint32_t) * param_count);
-
-         if (v.run()) {
-            /* Success! Backup is not needed */
-            ralloc_free(param);
-            return brw_vec4_generate_assembly(compiler, &params->base,
-                                              nir, &prog_data->base,
-                                              v.cfg,
-                                              v.performance_analysis.require(),
-                                              debug_enabled);
-         } else {
-            /* These variables could be modified by the execution of the GS
-             * visitor if it packed the uniforms in the push constant buffer.
-             * As it failed, we need restore them so we can start again with
-             * DUAL_INSTANCED or SINGLE mode.
-             *
-             * FIXME: Could more variables be modified by this execution?
-             */
-            memcpy(prog_data->base.base.param, param,
-                   sizeof(uint32_t) * param_count);
-            prog_data->base.base.nr_params = param_count;
-            ralloc_free(param);
-         }
-      }
-   }
-
-   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
-    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
-    * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
-    *
-    * FIXME: Single dispatch mode requires that the driver can handle
-    * interleaving of input registers, but this is already supported (dual
-    * instance mode has the same requirement). However, to take full advantage
-    * of single dispatch mode to reduce register pressure we would also need to
-    * do interleaved outputs, but currently, the vec4 visitor and generator
-    * classes do not support this, so at the moment register pressure in
-    * single and dual instance modes is the same.
-    *
-    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
-    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
-    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
-    * is also supported. When InstanceCount=1 (one instance per object) software
-    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
-    * the best choice for performance, followed by SINGLE mode."
-    *
-    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
-    * mode is more performant when invocations > 1. Gfx6 only supports
-    * SINGLE mode.
-    */
-   if (prog_data->invocations <= 1 || compiler->devinfo->ver < 7)
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X1_SINGLE;
-   else
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE;
-
-   brw::vec4_gs_visitor *gs = NULL;
-   const unsigned *ret = NULL;
-
-   if (compiler->devinfo->ver >= 7)
-      gs = new brw::vec4_gs_visitor(compiler, &params->base, &c, prog_data,
-                                    nir, false /* no_spills */,
-                                    debug_enabled);
-   else
-      gs = new brw::gfx6_gs_visitor(compiler, &params->base, &c, prog_data,
-                                    nir, false /* no_spills */,
-                                    debug_enabled);
-
-   if (!gs->run()) {
-      params->base.error_str =
-         ralloc_strdup(params->base.mem_ctx, gs->fail_msg);
-   } else {
-      ret = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                       &prog_data->base, gs->cfg,
-                                       gs->performance_analysis.require(),
-                                       debug_enabled);
-   }
-
-   delete gs;
-   return ret;
+   return NULL;
 }
 
diff --git a/src/intel/compiler/brw_compile_tcs.cpp b/src/intel/compiler/brw_compile_tcs.cpp
index adce8e38e40..31b0a4ecdae 100644
--- a/src/intel/compiler/brw_compile_tcs.cpp
+++ b/src/intel/compiler/brw_compile_tcs.cpp
@@ -3,9 +3,9 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "brw_eu.h"
 #include "intel_nir.h"
 #include "brw_nir.h"
-#include "brw_vec4_tcs.h"
 #include "brw_fs.h"
 #include "brw_private.h"
 #include "dev/intel_debug.h"
@@ -49,9 +49,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    struct brw_tcs_prog_data *prog_data = params->prog_data;
    struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
 
-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);
-   const unsigned *assembly;
 
    vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
    prog_data->base.base.ray_queries = nir->info.ray_queries;
@@ -89,7 +87,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       prog_data->instances = nir->info.tess.tcs_vertices_out;
       prog_data->include_primitive_id = has_primitive_id;
    } else {
-      unsigned verts_per_thread = is_scalar ? 8 : 2;
+      unsigned verts_per_thread = 8;
       vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
       prog_data->instances =
          DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
@@ -135,54 +133,33 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
    }
 
-   if (is_scalar) {
-      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_tcs()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
-      if (unlikely(debug_enabled)) {
-         g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
-                                        "%s tessellation control shader %s",
-                                        nir->info.label ? nir->info.label
-                                                        : "unnamed",
-                                        nir->info.name));
-      }
-
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-
-      assembly = g.get_assembly();
-   } else {
-      brw::vec4_tcs_visitor v(compiler, &params->base, key, prog_data,
-                              nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      if (INTEL_DEBUG(DEBUG_TCS))
-         v.dump_instructions();
-
-
-      assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                            &prog_data->base, v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
+   const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
+   fs_visitor v(compiler, &params->base, &key->base,
+                &prog_data->base.base, nir, dispatch_width,
+                params->base.stats != NULL, debug_enabled);
+   if (!v.run_tcs()) {
+      params->base.error_str =
+         ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+      return NULL;
    }
 
-   return assembly;
+   assert(v.payload().num_regs % reg_unit(devinfo) == 0);
+   prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
+
+   fs_generator g(compiler, &params->base,
+                  &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
+   if (unlikely(debug_enabled)) {
+      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
+                                     "%s tessellation control shader %s",
+                                     nir->info.label ? nir->info.label
+                                                     : "unnamed",
+                                     nir->info.name));
+   }
+
+   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                   v.performance_analysis.require(), params->base.stats);
+
+   g.add_const_data(nir->constant_data, nir->constant_data_size);
+
+   return g.get_assembly();
 }
diff --git a/src/intel/compiler/brw_compile_vs.cpp b/src/intel/compiler/brw_compile_vs.cpp
index c1e089ec280..983c2a837ec 100644
--- a/src/intel/compiler/brw_compile_vs.cpp
+++ b/src/intel/compiler/brw_compile_vs.cpp
@@ -3,11 +3,9 @@
  * SPDX-License-Identifier: MIT
  */
 
-#include "brw_vec4.h"
 #include "brw_fs.h"
 #include "brw_eu.h"
 #include "brw_nir.h"
-#include "brw_vec4_vs.h"
 #include "brw_private.h"
 #include "dev/intel_debug.h"
 
@@ -28,11 +26,8 @@ brw_compile_vs(const struct brw_compiler *compiler,
    prog_data->base.base.ray_queries = nir->info.ray_queries;
    prog_data->base.base.total_scratch = 0;
 
-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
    brw_nir_apply_key(nir, compiler, &key->base, 8);
 
-   const unsigned *assembly = NULL;
-
    prog_data->inputs_read = nir->info.inputs_read;
    prog_data->double_inputs_read = nir->info.vs.double_inputs;
 
@@ -83,17 +78,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
    if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
           prog_data->uses_drawid = true;
 
-   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
-    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
-    * vec4 mode, the hardware appears to wedge unless we read something.
-    */
-   if (is_scalar)
-      prog_data->base.urb_read_length =
-         DIV_ROUND_UP(nr_attribute_slots, 2);
-   else
-      prog_data->base.urb_read_length =
-         DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
-
+   prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
    prog_data->nr_attribute_slots = nr_attribute_slots;
 
    /* Since vertex shaders reuse the same VUE entry for inputs and outputs
@@ -114,58 +99,37 @@ brw_compile_vs(const struct brw_compiler *compiler,
       brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
    }
 
-   if (is_scalar) {
-      const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+   const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
+   prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
 
-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_vs()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg =
-         v.payload().num_regs / reg_unit(compiler->devinfo);
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, v.runtime_check_aads_emit,
-                     MESA_SHADER_VERTEX);
-      if (unlikely(debug_enabled)) {
-         const char *debug_name =
-            ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
-                            nir->info.label ? nir->info.label :
-                               "unnamed",
-                            nir->info.name);
-
-         g.enable_debug(debug_name);
-      }
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-      assembly = g.get_assembly();
+   fs_visitor v(compiler, &params->base, &key->base,
+                &prog_data->base.base, nir, dispatch_width,
+                params->base.stats != NULL, debug_enabled);
+   if (!v.run_vs()) {
+      params->base.error_str =
+         ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+      return NULL;
    }
 
-   if (!assembly) {
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
+   assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
+   prog_data->base.base.dispatch_grf_start_reg =
+      v.payload().num_regs / reg_unit(compiler->devinfo);
 
-      vec4_vs_visitor v(compiler, &params->base, key, prog_data,
-                        nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
+   fs_generator g(compiler, &params->base,
+                  &prog_data->base.base, v.runtime_check_aads_emit,
+                  MESA_SHADER_VERTEX);
+   if (unlikely(debug_enabled)) {
+      const char *debug_name =
+         ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
+                         nir->info.label ? nir->info.label :
+                            "unnamed",
+                         nir->info.name);
 
-      assembly = brw_vec4_generate_assembly(compiler, &params->base,
-                                            nir, &prog_data->base,
-                                            v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
+      g.enable_debug(debug_name);
    }
+   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                   v.performance_analysis.require(), params->base.stats);
+   g.add_const_data(nir->constant_data, nir->constant_data_size);
 
-   return assembly;
+   return g.get_assembly();
 }
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index 51034e3c78c..d7eac3ca69c 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -29,77 +29,51 @@
 #include "compiler/nir/nir.h"
 #include "util/u_debug.h"
 
-#define COMMON_OPTIONS                                                        \
-   .has_uclz = true,                                                          \
-   .lower_fdiv = true,                                                        \
-   .lower_scmp = true,                                                        \
-   .lower_flrp16 = true,                                                      \
-   .lower_fmod = true,                                                        \
-   .lower_ufind_msb = true,                                                   \
-   .lower_uadd_carry = true,                                                  \
-   .lower_usub_borrow = true,                                                 \
-   .lower_flrp64 = true,                                                      \
-   .lower_fisnormal = true,                                                   \
-   .lower_isign = true,                                                       \
-   .lower_ldexp = true,                                                       \
-   .lower_bitfield_extract = true,                                            \
-   .lower_bitfield_insert = true,                                             \
-   .lower_device_index_to_zero = true,                                        \
-   .vectorize_io = true,                                                      \
-   .vectorize_tess_levels = true,                                             \
-   .use_interpolated_input_intrinsics = true,                                 \
-   .lower_insert_byte = true,                                                 \
-   .lower_insert_word = true,                                                 \
-   .vertex_id_zero_based = true,                                              \
-   .lower_base_vertex = true,                                                 \
-   .support_16bit_alu = true,                                                 \
-   .lower_uniforms_to_ubo = true
-
-#define COMMON_SCALAR_OPTIONS                                                 \
-   .lower_to_scalar = true,                                                   \
-   .lower_pack_half_2x16 = true,                                              \
-   .lower_pack_snorm_2x16 = true,                                             \
-   .lower_pack_snorm_4x8 = true,                                              \
-   .lower_pack_unorm_2x16 = true,                                             \
-   .lower_pack_unorm_4x8 = true,                                              \
-   .lower_unpack_half_2x16 = true,                                            \
-   .lower_unpack_snorm_2x16 = true,                                           \
-   .lower_unpack_snorm_4x8 = true,                                            \
-   .lower_unpack_unorm_2x16 = true,                                           \
-   .lower_unpack_unorm_4x8 = true,                                            \
-   .lower_hadd64 = true,                                                      \
-   .avoid_ternary_with_two_constants = true,                                  \
-   .has_pack_32_4x8 = true,                                                   \
-   .max_unroll_iterations = 32,                                               \
-   .force_indirect_unrolling = nir_var_function_temp,                         \
-   .divergence_analysis_options =                                             \
-      (nir_divergence_single_patch_per_tcs_subgroup |                         \
-       nir_divergence_single_patch_per_tes_subgroup |                         \
-       nir_divergence_shader_record_ptr_uniform)
-
 const struct nir_shader_compiler_options brw_scalar_nir_options = {
-   COMMON_OPTIONS,
-   COMMON_SCALAR_OPTIONS,
-};
-
-const struct nir_shader_compiler_options brw_vector_nir_options = {
-   COMMON_OPTIONS,
-
-   /* In the vec4 backend, our dpN instruction replicates its result to all the
-    * components of a vec4.  We would like NIR to give us replicated fdot
-    * instructions because it can optimize better for us.
-    */
-   .fdot_replicates = true,
-
-   .lower_usub_sat = true,
+   .avoid_ternary_with_two_constants = true,
+   .divergence_analysis_options =
+      (nir_divergence_single_patch_per_tcs_subgroup |
+       nir_divergence_single_patch_per_tes_subgroup |
+       nir_divergence_shader_record_ptr_uniform),
+   .force_indirect_unrolling = nir_var_function_temp,
+   .has_pack_32_4x8 = true,
+   .has_uclz = true,
+   .lower_base_vertex = true,
+   .lower_bitfield_extract = true,
+   .lower_bitfield_insert = true,
+   .lower_device_index_to_zero = true,
+   .lower_fdiv = true,
+   .lower_fisnormal = true,
+   .lower_flrp16 = true,
+   .lower_flrp64 = true,
+   .lower_fmod = true,
+   .lower_hadd64 = true,
+   .lower_insert_byte = true,
+   .lower_insert_word = true,
+   .lower_isign = true,
+   .lower_ldexp = true,
+   .lower_pack_half_2x16 = true,
    .lower_pack_snorm_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
    .lower_pack_unorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_scmp = true,
+   .lower_to_scalar = true,
+   .lower_uadd_carry = true,
+   .lower_ufind_msb = true,
+   .lower_uniforms_to_ubo = true,
+   .lower_unpack_half_2x16 = true,
    .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
    .lower_unpack_unorm_2x16 = true,
-   .lower_extract_byte = true,
-   .lower_extract_word = true,
-   .intel_vec4 = true,
+   .lower_unpack_unorm_4x8 = true,
+   .lower_usub_borrow = true,
    .max_unroll_iterations = 32,
+   .support_16bit_alu = true,
+   .use_interpolated_input_intrinsics = true,
+   .vectorize_io = true,
+   .vectorize_tess_levels = true,
+   .vertex_id_zero_based = true,
 };
 
 struct brw_compiler *
@@ -129,15 +103,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
        devinfo->platform != INTEL_PLATFORM_ARL_H) ||
       debug_get_bool_option("INTEL_LOWER_DPAS", false);
 
-   /* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
-   for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
-      compiler->scalar_stage[i] = devinfo->ver >= 8 ||
-         i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
-   }
-
-   for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
-      compiler->scalar_stage[i] = true;
-
    nir_lower_int64_options int64_options =
       nir_lower_imul64 |
       nir_lower_isign64 |
@@ -175,13 +140,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
    for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
       struct nir_shader_compiler_options *nir_options =
          rzalloc(compiler, struct nir_shader_compiler_options);
-      bool is_scalar = compiler->scalar_stage[i];
-      if (is_scalar) {
-         *nir_options = brw_scalar_nir_options;
-         int64_options |= nir_lower_usub_sat64;
-      } else {
-         *nir_options = brw_vector_nir_options;
-      }
+      *nir_options = brw_scalar_nir_options;
+      int64_options |= nir_lower_usub_sat64;
 
       /* Prior to Gfx6, there are no three source operations, and Gfx11 loses
        * LRP.
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index c7095b5cce5..0b2155a5626 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -86,7 +86,6 @@ struct brw_compiler {
    void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
    void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
 
-   bool scalar_stage[MESA_ALL_SHADER_STAGES];
    bool use_tcs_multi_patch;
    struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];
 
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 5e2b87d04e0..af691f26aac 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -714,7 +714,7 @@ brw_nir_lower_fs_outputs(nir_shader *nir)
 })
 
 void
-brw_nir_optimize(nir_shader *nir, bool is_scalar,
+brw_nir_optimize(nir_shader *nir,
                  const struct intel_device_info *devinfo)
 {
    bool progress;
@@ -752,18 +752,11 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
       OPT(nir_opt_ray_queries);
       OPT(nir_opt_ray_query_ranges);
 
-      if (is_scalar) {
-         OPT(nir_lower_alu_to_scalar, NULL, NULL);
-      } else {
-         OPT(nir_opt_shrink_stores, true);
-         OPT(nir_opt_shrink_vectors);
-      }
+      OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
       OPT(nir_copy_prop);
 
-      if (is_scalar) {
-         OPT(nir_lower_phis_to_scalar, false);
-      }
+      OPT(nir_lower_phis_to_scalar, false);
 
       OPT(nir_copy_prop);
       OPT(nir_opt_dce);
@@ -784,15 +777,9 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
        * For indirect loads of uniforms (push constants), we assume that array
        * indices will nearly always be in bounds and the cost of the load is
        * low.  Therefore there shouldn't be a performance benefit to avoid it.
-       * However, in vec4 tessellation shaders, these loads operate by
-       * actually pulling from memory.
        */
-      const bool is_vec4_tessellation = !is_scalar &&
-         (nir->info.stage == MESA_SHADER_TESS_CTRL ||
-          nir->info.stage == MESA_SHADER_TESS_EVAL);
-      OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
-      OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation,
-          devinfo->ver >= 6);
+      OPT(nir_opt_peephole_select, 0, true, false);
+      OPT(nir_opt_peephole_select, 8, true, devinfo->ver >= 6);
 
       OPT(nir_opt_intrinsics);
       OPT(nir_opt_idiv_const, 32);
@@ -1014,15 +1001,11 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    const struct intel_device_info *devinfo = compiler->devinfo;
    UNUSED bool progress; /* Written by OPT */
 
-   const bool is_scalar = compiler->scalar_stage[nir->info.stage];
-
    nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
 
    OPT(nir_lower_frexp);
 
-   if (is_scalar) {
-      OPT(nir_lower_alu_to_scalar, NULL, NULL);
-   }
+   OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
    if (nir->info.stage == MESA_SHADER_GEOMETRY)
       OPT(nir_lower_gs_intrinsics, 0);
@@ -1081,7 +1064,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    OPT(nir_split_var_copies);
    OPT(nir_split_struct_vars, nir_var_function_temp);
 
-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);
 
    OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options);
    if (OPT(nir_lower_int64_float_conversions)) {
@@ -1102,9 +1085,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
       OPT(nir_opt_large_constants, NULL, 32);
    }
 
-   if (is_scalar) {
-      OPT(nir_lower_load_const_to_scalar);
-   }
+   OPT(nir_lower_load_const_to_scalar);
 
    OPT(nir_lower_system_values);
    nir_lower_compute_system_values_options lower_csv_options = {
@@ -1116,7 +1097,6 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
       .ballot_bit_size = 32,
       .ballot_components = 1,
       .lower_to_scalar = true,
-      .lower_vote_trivial = !is_scalar,
       .lower_relative_shuffle = true,
       .lower_quad_broadcast_dynamic = true,
       .lower_elect = true,
@@ -1142,7 +1122,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
     * issues are helped but nothing else in shader-db is hurt except for maybe
     * that one kerbal space program shader.
     */
-   if (is_scalar && !(indirect_mask & nir_var_function_temp))
+   if (!(indirect_mask & nir_var_function_temp))
       OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);
 
    /* Lower array derefs of vectors for SSBO and UBO loads.  For both UBOs and
@@ -1165,7 +1145,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
       OPT(intel_nir_clamp_per_vertex_loads);
 
    /* Get rid of split copies */
-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);
 }
 
 static bool
@@ -1321,18 +1301,13 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
    nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
    nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
 
-   const bool p_is_scalar = compiler->scalar_stage[producer->info.stage];
-   const bool c_is_scalar = compiler->scalar_stage[consumer->info.stage];
-
-   if (p_is_scalar && c_is_scalar) {
-      NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
-      NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
-      brw_nir_optimize(producer, p_is_scalar, devinfo);
-      brw_nir_optimize(consumer, c_is_scalar, devinfo);
-   }
+   NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
+   NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
+   brw_nir_optimize(producer, devinfo);
+   brw_nir_optimize(consumer, devinfo);
 
    if (nir_link_opt_varyings(producer, consumer))
-      brw_nir_optimize(consumer, c_is_scalar, devinfo);
+      brw_nir_optimize(consumer, devinfo);
 
    NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
    NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
@@ -1361,8 +1336,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
                   brw_nir_no_indirect_mask(compiler, consumer->info.stage),
                   UINT32_MAX);
 
-      brw_nir_optimize(producer, p_is_scalar, devinfo);
-      brw_nir_optimize(consumer, c_is_scalar, devinfo);
+      brw_nir_optimize(producer, devinfo);
+      brw_nir_optimize(consumer, devinfo);
 
       if (producer->info.stage == MESA_SHADER_MESH &&
             consumer->info.stage == MESA_SHADER_FRAGMENT) {
@@ -1591,48 +1566,45 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
                                enum brw_robustness_flags robust_flags)
 {
    bool progress = false;
-   const bool is_scalar = compiler->scalar_stage[nir->info.stage];
 
-   if (is_scalar) {
-      nir_load_store_vectorize_options options = {
-         .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
-                  nir_var_mem_global | nir_var_mem_shared |
-                  nir_var_mem_task_payload,
-         .callback = brw_nir_should_vectorize_mem,
-         .robust_modes = (nir_variable_mode)0,
-      };
+   nir_load_store_vectorize_options options = {
+      .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
+               nir_var_mem_global | nir_var_mem_shared |
+               nir_var_mem_task_payload,
+      .callback = brw_nir_should_vectorize_mem,
+      .robust_modes = (nir_variable_mode)0,
+   };
 
-      if (robust_flags & BRW_ROBUSTNESS_UBO)
-         options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
-      if (robust_flags & BRW_ROBUSTNESS_SSBO)
-         options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;
+   if (robust_flags & BRW_ROBUSTNESS_UBO)
+      options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
+   if (robust_flags & BRW_ROBUSTNESS_SSBO)
+      options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;
 
-      OPT(nir_opt_load_store_vectorize, &options);
+   OPT(nir_opt_load_store_vectorize, &options);
 
-      /* Only run the blockify optimization on Gfx9+ because although prior HW
-       * versions have support for block loads, they do have limitations on
-       * alignment as well as requiring split sends which are not supported
-       * there.
+   /* Only run the blockify optimization on Gfx9+.  Earlier HW versions do
+    * support block loads, but with limitations on alignment, and they
+    * would also require split sends, which are not supported on those
+    * versions.
+    */
+   if (compiler->devinfo->ver >= 9) {
+      /* Required for nir_divergence_analysis() */
+      OPT(nir_convert_to_lcssa, true, true);
+
+      /* When HW supports block loads, using the divergence analysis, try
+       * to find uniform SSBO loads and turn them into block loads.
+       *
+       * Rerun the vectorizer after that to make the largest possible block
+       * loads.
+       *
+       * This is a win on 2 fronts:
+       *   - fewer send messages
+       *   - reduced register pressure
        */
-      if (compiler->devinfo->ver >= 9) {
-         /* Required for nir_divergence_analysis() */
-         OPT(nir_convert_to_lcssa, true, true);
-
-         /* When HW supports block loads, using the divergence analysis, try
-          * to find uniform SSBO loads and turn them into block loads.
-          *
-          * Rerun the vectorizer after that to make the largest possible block
-          * loads.
-          *
-          * This is a win on 2 fronts :
-          *   - fewer send messages
-          *   - reduced register pressure
-          */
-         nir_divergence_analysis(nir);
-         if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
-            OPT(nir_opt_load_store_vectorize, &options);
-         OPT(nir_opt_remove_phis);
-      }
+      nir_divergence_analysis(nir);
+      if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
+         OPT(nir_opt_load_store_vectorize, &options);
+      OPT(nir_opt_remove_phis);
    }
 
    nir_lower_mem_access_bit_sizes_options mem_access_options = {
@@ -1683,7 +1655,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
                     enum brw_robustness_flags robust_flags)
 {
    const struct intel_device_info *devinfo = compiler->devinfo;
-   const bool is_scalar = compiler->scalar_stage[nir->info.stage];
 
    UNUSED bool progress; /* Written by OPT */
 
@@ -1710,20 +1681,20 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
    if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
       NIR_PASS(_, nir, intel_nir_lower_shading_rate_output);
 
-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);
 
-   if (is_scalar && nir_shader_has_local_variables(nir)) {
+   if (nir_shader_has_local_variables(nir)) {
       OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
           glsl_get_natural_size_align_bytes);
       OPT(nir_lower_explicit_io, nir_var_function_temp,
           nir_address_format_32bit_offset);
-      brw_nir_optimize(nir, is_scalar, devinfo);
+      brw_nir_optimize(nir, devinfo);
    }
 
    brw_vectorize_lower_mem_access(nir, compiler, robust_flags);
 
    if (OPT(nir_lower_int64))
-      brw_nir_optimize(nir, is_scalar, devinfo);
+      brw_nir_optimize(nir, devinfo);
 
    if (devinfo->ver >= 6) {
       /* Try and fuse multiply-adds, if successful, run shrink_vectors to
@@ -1741,8 +1712,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
          OPT(nir_opt_shrink_vectors);
    }
 
-   if (is_scalar)
-      OPT(intel_nir_opt_peephole_imul32x16);
+   OPT(intel_nir_opt_peephole_imul32x16);
 
    if (OPT(nir_opt_comparison_pre)) {
       OPT(nir_copy_prop);
@@ -1753,27 +1723,15 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
        * the other optimization passes) will have removed at least one
        * instruction from one of the branches of the if-statement, so now it
        * might be under the threshold of conversion to bcsel.
-       *
-       * See brw_nir_optimize for the explanation of is_vec4_tessellation.
        */
-      const bool is_vec4_tessellation = !is_scalar &&
-         (nir->info.stage == MESA_SHADER_TESS_CTRL ||
-          nir->info.stage == MESA_SHADER_TESS_EVAL);
-      OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false);
-      OPT(nir_opt_peephole_select, 1, is_vec4_tessellation,
-          compiler->devinfo->ver >= 6);
+      OPT(nir_opt_peephole_select, 0, false, false);
+      OPT(nir_opt_peephole_select, 1, false, compiler->devinfo->ver >= 6);
    }
 
    do {
       progress = false;
       if (OPT(nir_opt_algebraic_late)) {
-         /* At this late stage, anything that makes more constants will wreak
-          * havok on the vec4 backend.  The handling of constants in the vec4
-          * backend is not good.
-          */
-         if (is_scalar)
-            OPT(nir_opt_constant_folding);
-
+         OPT(nir_opt_constant_folding);
          OPT(nir_copy_prop);
          OPT(nir_opt_dce);
          OPT(nir_opt_cse);
@@ -1783,19 +1741,16 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 
    if (OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64)) {
       if (OPT(nir_lower_int64)) {
-         brw_nir_optimize(nir, is_scalar, devinfo);
+         brw_nir_optimize(nir, devinfo);
       }
    }
 
    OPT(intel_nir_lower_conversions);
 
-   if (is_scalar)
-      OPT(nir_lower_alu_to_scalar, NULL, NULL);
+   OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
    while (OPT(nir_opt_algebraic_distribute_src_mods)) {
-      if (is_scalar)
-         OPT(nir_opt_constant_folding);
-
+      OPT(nir_opt_constant_folding);
       OPT(nir_copy_prop);
       OPT(nir_opt_dce);
       OPT(nir_opt_cse);
@@ -1821,7 +1776,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
       OPT(nir_lower_subgroups, &subgroups_options);
 
       if (OPT(nir_lower_int64))
-         brw_nir_optimize(nir, is_scalar, devinfo);
+         brw_nir_optimize(nir, devinfo);
 
       divergence_analysis_dirty = true;
    }
@@ -1834,7 +1789,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
        * that must be lowered.
        */
       if (OPT(nir_lower_int64))
-         brw_nir_optimize(nir, is_scalar, devinfo);
+         brw_nir_optimize(nir, devinfo);
 
       OPT(nir_lower_subgroups, &subgroups_options);
    }
@@ -1880,11 +1835,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 
    OPT(nir_convert_from_ssa, true);
 
-   if (!is_scalar) {
-      OPT(nir_move_vec_src_uses_to_dest, true);
-      OPT(nir_lower_vec_to_regs, NULL, NULL);
-   }
-
    OPT(nir_opt_dce);
 
    if (OPT(nir_opt_rematerialize_compares))
@@ -2035,8 +1985,7 @@ brw_nir_apply_key(nir_shader *nir,
       OPT(brw_nir_limit_trig_input_range_workaround);
 
    if (progress) {
-      const bool is_scalar = compiler->scalar_stage[nir->info.stage];
-      brw_nir_optimize(nir, is_scalar, compiler->devinfo);
+      brw_nir_optimize(nir, compiler->devinfo);
    }
 }
 
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 891d139cb3f..cf03b908b93 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -34,7 +34,6 @@ extern "C" {
 #endif
 
 extern const struct nir_shader_compiler_options brw_scalar_nir_options;
-extern const struct nir_shader_compiler_options brw_vector_nir_options;
 
 int type_size_vec4(const struct glsl_type *type, bool bindless);
 int type_size_dvec4(const struct glsl_type *type, bool bindless);
@@ -268,7 +267,7 @@ void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
                                 nir_shader *nir,
                                 struct brw_ubo_range out_ranges[4]);
 
-void brw_nir_optimize(nir_shader *nir, bool is_scalar,
+void brw_nir_optimize(nir_shader *nir,
                       const struct intel_device_info *devinfo);
 
 nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx,
diff --git a/src/intel/compiler/brw_nir_rt.c b/src/intel/compiler/brw_nir_rt.c
index b5daa1090de..81538732d71 100644
--- a/src/intel/compiler/brw_nir_rt.c
+++ b/src/intel/compiler/brw_nir_rt.c
@@ -529,8 +529,7 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
 
    NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);
 
-   const bool is_scalar = true;
-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);
 
    return nir;
 }
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index f56ae8d68d1..2176c3d4912 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -26,7 +26,6 @@
 #include "brw_fs.h"
 #include "brw_nir.h"
 #include "brw_private.h"
-#include "brw_vec4_tes.h"
 #include "dev/intel_debug.h"
 #include "util/macros.h"
 #include "util/u_debug.h"
@@ -1310,9 +1309,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
    const struct intel_vue_map *input_vue_map = params->input_vue_map;
    struct brw_tes_prog_data *prog_data = params->prog_data;
 
-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TES);
-   const unsigned *assembly;
 
    prog_data->base.base.stage = MESA_SHADER_TESS_EVAL;
    prog_data->base.base.ray_queries = nir->info.ray_queries;
@@ -1395,55 +1392,35 @@ brw_compile_tes(const struct brw_compiler *compiler,
                         MESA_SHADER_TESS_EVAL);
    }
 
-   if (is_scalar) {
-      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_tes()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
-
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
-      if (unlikely(debug_enabled)) {
-         g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
-                                        "%s tessellation evaluation shader %s",
-                                        nir->info.label ? nir->info.label
-                                                        : "unnamed",
-                                        nir->info.name));
-      }
-
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-
-      assembly = g.get_assembly();
-   } else {
-      brw::vec4_tes_visitor v(compiler, &params->base, key, prog_data,
-                              nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-	 return NULL;
-      }
-
-      if (unlikely(debug_enabled))
-	 v.dump_instructions();
-
-      assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                            &prog_data->base, v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
+   const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
+   fs_visitor v(compiler, &params->base, &key->base,
+                &prog_data->base.base, nir, dispatch_width,
+                params->base.stats != NULL, debug_enabled);
+   if (!v.run_tes()) {
+      params->base.error_str =
+         ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+      return NULL;
    }
 
-   return assembly;
+   assert(v.payload().num_regs % reg_unit(devinfo) == 0);
+   prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
+
+   prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+
+   fs_generator g(compiler, &params->base,
+                  &prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
+   if (unlikely(debug_enabled)) {
+      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
+                                     "%s tessellation evaluation shader %s",
+                                     nir->info.label ? nir->info.label
+                                                     : "unnamed",
+                                     nir->info.name));
+   }
+
+   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                   v.performance_analysis.require(), params->base.stats);
+
+   g.add_const_data(nir->constant_data, nir->constant_data_size);
+
+   return g.get_assembly();
 }
diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h
index ce666e4ed3b..fbd50c07e7e 100644
--- a/src/intel/compiler/brw_shader.h
+++ b/src/intel/compiler/brw_shader.h
@@ -134,7 +134,6 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
                          gl_shader_stage stage)
 {
    const struct intel_device_info *devinfo = compiler->devinfo;
-   const bool is_scalar = compiler->scalar_stage[stage];
    nir_variable_mode indirect_mask = (nir_variable_mode) 0;
 
    switch (stage) {
@@ -143,19 +142,14 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
       indirect_mask |= nir_var_shader_in;
       break;
 
-   case MESA_SHADER_GEOMETRY:
-      if (!is_scalar)
-         indirect_mask |= nir_var_shader_in;
-      break;
-
    default:
       /* Everything else can handle indirect inputs */
       break;
    }
 
-   if (is_scalar && stage != MESA_SHADER_TESS_CTRL &&
-                    stage != MESA_SHADER_TASK &&
-                    stage != MESA_SHADER_MESH)
+   if (stage != MESA_SHADER_TESS_CTRL &&
+       stage != MESA_SHADER_TASK &&
+       stage != MESA_SHADER_MESH)
       indirect_mask |= nir_var_shader_out;
 
    /* On HSW+, we allow indirects in scalar shaders.  They get implemented
@@ -168,7 +162,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
     * indirects as scratch all the time, we may easily exceed this limit
     * without having any fallback.
     */
-   if (is_scalar && devinfo->verx10 <= 70)
+   if (devinfo->verx10 <= 70)
       indirect_mask |= nir_var_function_temp;
 
    return indirect_mask;
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 4390b45c3d4..ce483508884 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -468,10 +468,7 @@ get_features(const struct anv_physical_device *pdevice,
       .textureCompressionBC                     = true,
       .occlusionQueryPrecise                    = true,
       .pipelineStatisticsQuery                  = true,
-      /* We can't do image stores in vec4 shaders */
-      .vertexPipelineStoresAndAtomics =
-         pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] &&
-         pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY],
+      .vertexPipelineStoresAndAtomics           = true,
       .fragmentStoresAndAtomics                 = true,
       .shaderTessellationAndGeometryPointSize   = true,
       .shaderImageGatherExtended                = true,
@@ -940,8 +937,7 @@ get_properties_1_1(const struct anv_physical_device *pdevice,
    p->subgroupSize = BRW_SUBGROUP_SIZE;
    VkShaderStageFlags scalar_stages = 0;
    for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
-      if (pdevice->compiler->scalar_stage[stage])
-         scalar_stages |= mesa_to_vk_shader_stage(stage);
+      scalar_stages |= mesa_to_vk_shader_stage(stage);
    }
    if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
       scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index b25e3c36c46..fc766e927e7 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -130,10 +130,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
    push_start = MIN2(push_start, push_end);
    push_start = ROUND_DOWN_TO(push_start, 32);
 
-   /* For vec4 our push data size needs to be aligned to a vec4 and for
-    * scalar, it needs to be aligned to a DWORD.
-    */
-   const unsigned alignment = compiler->scalar_stage[nir->info.stage] ? 4 : 16;
+   /* The scalar back-end needs the push data size aligned to a DWORD. */
+   const unsigned alignment = 4;
    nir->num_uniforms = ALIGN(push_end - push_start, alignment);
    prog_data->nr_params = nir->num_uniforms / 4;
    prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params);
@@ -218,13 +216,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
    if (push_ubo_ranges) {
       brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
 
-      /* The vec4 back-end pushes at most 32 regs while the scalar back-end
-       * pushes up to 64.  This is primarily because the scalar back-end has a
-       * massively more competent register allocator and so the risk of
-       * spilling due to UBO pushing isn't nearly as high.
-       */
-      const unsigned max_push_regs =
-         compiler->scalar_stage[nir->info.stage] ? 64 : 32;
+      const unsigned max_push_regs = 64;
 
       unsigned total_push_regs = push_constant_range.length;
       for (unsigned i = 0; i < 4; i++) {