intel/brw: Always use scalar shaders

Remove the scalar_stage[] array, since every shader stage is now always compiled with the scalar backend. This removes all remaining uses of vec4 shaders in brw. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
2026-05-23 10:48:08 +02:00 · 2024-02-14 22:41:17 -08:00 · 2024-02-14 22:41:17 -08:00 · 7c23b90537
commit 7c23b90537
parent 303fd4e935
12 changed files with 224 additions and 523 deletions
--- a/src/intel/compiler/brw_compile_gs.cpp
+++ b/src/intel/compiler/brw_compile_gs.cpp
@ -3,8 +3,6 @@
 * SPDX-License-Identifier: MIT
 */

-#include "brw_vec4_gs_visitor.h"
-#include "gfx6_gs_visitor.h"
 #include "brw_eu.h"
 #include "brw_fs.h"
 #include "brw_prim.h"
@ -41,7 +39,6 @@ brw_compile_gs(const struct brw_compiler *compiler,
   memset(&c, 0, sizeof(c));
   c.key = *key;

-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);

   prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
@ -266,135 +263,33 @@ brw_compile_gs(const struct brw_compiler *compiler,
      brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
   }

-   if (is_scalar) {
-      fs_visitor v(compiler, &params->base, &c, prog_data, nir,
-                   params->base.stats != NULL, debug_enabled);
-      if (v.run_gs()) {
-         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+   fs_visitor v(compiler, &params->base, &c, prog_data, nir,
+                params->base.stats != NULL, debug_enabled);
+   if (v.run_gs()) {
+      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;

-         assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
-         prog_data->base.base.dispatch_grf_start_reg =
-            v.payload().num_regs / reg_unit(compiler->devinfo);
+      assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
+      prog_data->base.base.dispatch_grf_start_reg =
+         v.payload().num_regs / reg_unit(compiler->devinfo);

-         fs_generator g(compiler, &params->base,
-                        &prog_data->base.base, false, MESA_SHADER_GEOMETRY);
-         if (unlikely(debug_enabled)) {
-            const char *label =
-               nir->info.label ? nir->info.label : "unnamed";
-            char *name = ralloc_asprintf(params->base.mem_ctx,
-                                         "%s geometry shader %s",
-                                         label, nir->info.name);
-            g.enable_debug(name);
-         }
-         g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
-                         v.performance_analysis.require(), params->base.stats);
-         g.add_const_data(nir->constant_data, nir->constant_data_size);
-         return g.get_assembly();
+      fs_generator g(compiler, &params->base,
+                     &prog_data->base.base, false, MESA_SHADER_GEOMETRY);
+      if (unlikely(debug_enabled)) {
+         const char *label =
+            nir->info.label ? nir->info.label : "unnamed";
+         char *name = ralloc_asprintf(params->base.mem_ctx,
+                                      "%s geometry shader %s",
+                                      label, nir->info.name);
+         g.enable_debug(name);
      }
-
-      params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-
-      return NULL;
+      g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
+                      v.performance_analysis.require(), params->base.stats);
+      g.add_const_data(nir->constant_data, nir->constant_data_size);
+      return g.get_assembly();
   }

-   if (compiler->devinfo->ver >= 7) {
-      /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
-       * so without spilling. If the GS invocations count > 1, then we can't use
-       * dual object mode.
-       */
-      if (prog_data->invocations <= 1 &&
-          !INTEL_DEBUG(DEBUG_NO_DUAL_OBJECT_GS)) {
-         prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
+   params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);

-         brw::vec4_gs_visitor v(compiler, &params->base, &c, prog_data, nir,
-                                true /* no_spills */,
-                                debug_enabled);
-
-         /* Backup 'nr_params' and 'param' as they can be modified by the
-          * the DUAL_OBJECT visitor. If it fails, we will run the fallback
-          * (DUAL_INSTANCED or SINGLE mode) and we need to restore original
-          * values.
-          */
-         const unsigned param_count = prog_data->base.base.nr_params;
-         uint32_t *param = ralloc_array(NULL, uint32_t, param_count);
-         memcpy(param, prog_data->base.base.param,
-                sizeof(uint32_t) * param_count);
-
-         if (v.run()) {
-            /* Success! Backup is not needed */
-            ralloc_free(param);
-            return brw_vec4_generate_assembly(compiler, &params->base,
-                                              nir, &prog_data->base,
-                                              v.cfg,
-                                              v.performance_analysis.require(),
-                                              debug_enabled);
-         } else {
-            /* These variables could be modified by the execution of the GS
-             * visitor if it packed the uniforms in the push constant buffer.
-             * As it failed, we need restore them so we can start again with
-             * DUAL_INSTANCED or SINGLE mode.
-             *
-             * FIXME: Could more variables be modified by this execution?
-             */
-            memcpy(prog_data->base.base.param, param,
-                   sizeof(uint32_t) * param_count);
-            prog_data->base.base.nr_params = param_count;
-            ralloc_free(param);
-         }
-      }
-   }
-
-   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
-    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
-    * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
-    *
-    * FIXME: Single dispatch mode requires that the driver can handle
-    * interleaving of input registers, but this is already supported (dual
-    * instance mode has the same requirement). However, to take full advantage
-    * of single dispatch mode to reduce register pressure we would also need to
-    * do interleaved outputs, but currently, the vec4 visitor and generator
-    * classes do not support this, so at the moment register pressure in
-    * single and dual instance modes is the same.
-    *
-    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
-    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
-    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
-    * is also supported. When InstanceCount=1 (one instance per object) software
-    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
-    * the best choice for performance, followed by SINGLE mode."
-    *
-    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
-    * mode is more performant when invocations > 1. Gfx6 only supports
-    * SINGLE mode.
-    */
-   if (prog_data->invocations <= 1 || compiler->devinfo->ver < 7)
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X1_SINGLE;
-   else
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE;
-
-   brw::vec4_gs_visitor *gs = NULL;
-   const unsigned *ret = NULL;
-
-   if (compiler->devinfo->ver >= 7)
-      gs = new brw::vec4_gs_visitor(compiler, &params->base, &c, prog_data,
-                                    nir, false /* no_spills */,
-                                    debug_enabled);
-   else
-      gs = new brw::gfx6_gs_visitor(compiler, &params->base, &c, prog_data,
-                                    nir, false /* no_spills */,
-                                    debug_enabled);
-
-   if (!gs->run()) {
-      params->base.error_str =
-         ralloc_strdup(params->base.mem_ctx, gs->fail_msg);
-   } else {
-      ret = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                       &prog_data->base, gs->cfg,
-                                       gs->performance_analysis.require(),
-                                       debug_enabled);
-   }
-
-   delete gs;
-   return ret;
+   return NULL;
 }

--- a/src/intel/compiler/brw_compile_tcs.cpp
+++ b/src/intel/compiler/brw_compile_tcs.cpp
@ -3,9 +3,9 @@
 * SPDX-License-Identifier: MIT
 */

+#include "brw_eu.h"
 #include "intel_nir.h"
 #include "brw_nir.h"
-#include "brw_vec4_tcs.h"
 #include "brw_fs.h"
 #include "brw_private.h"
 #include "dev/intel_debug.h"
@ -49,9 +49,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
   struct brw_tcs_prog_data *prog_data = params->prog_data;
   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;

-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);
-   const unsigned *assembly;

   vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
   prog_data->base.base.ray_queries = nir->info.ray_queries;
@ -89,7 +87,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
      prog_data->instances = nir->info.tess.tcs_vertices_out;
      prog_data->include_primitive_id = has_primitive_id;
   } else {
-      unsigned verts_per_thread = is_scalar ? 8 : 2;
+      unsigned verts_per_thread = 8;
      vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
      prog_data->instances =
         DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
@ -135,54 +133,33 @@ brw_compile_tcs(const struct brw_compiler *compiler,
      brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
   }

-   if (is_scalar) {
-      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_tcs()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
-      if (unlikely(debug_enabled)) {
-         g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
-                                        "%s tessellation control shader %s",
-                                        nir->info.label ? nir->info.label
-                                                        : "unnamed",
-                                        nir->info.name));
-      }
-
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-
-      assembly = g.get_assembly();
-   } else {
-      brw::vec4_tcs_visitor v(compiler, &params->base, key, prog_data,
-                              nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      if (INTEL_DEBUG(DEBUG_TCS))
-         v.dump_instructions();
-
-
-      assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                            &prog_data->base, v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
+   const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
+   fs_visitor v(compiler, &params->base, &key->base,
+                &prog_data->base.base, nir, dispatch_width,
+                params->base.stats != NULL, debug_enabled);
+   if (!v.run_tcs()) {
+      params->base.error_str =
+         ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+      return NULL;
   }

-   return assembly;
+   assert(v.payload().num_regs % reg_unit(devinfo) == 0);
+   prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
+
+   fs_generator g(compiler, &params->base,
+                  &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
+   if (unlikely(debug_enabled)) {
+      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
+                                     "%s tessellation control shader %s",
+                                     nir->info.label ? nir->info.label
+                                                     : "unnamed",
+                                     nir->info.name));
+   }
+
+   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                   v.performance_analysis.require(), params->base.stats);
+
+   g.add_const_data(nir->constant_data, nir->constant_data_size);
+
+   return g.get_assembly();
 }
--- a/src/intel/compiler/brw_compile_vs.cpp
+++ b/src/intel/compiler/brw_compile_vs.cpp
@ -3,11 +3,9 @@
 * SPDX-License-Identifier: MIT
 */

-#include "brw_vec4.h"
 #include "brw_fs.h"
 #include "brw_eu.h"
 #include "brw_nir.h"
-#include "brw_vec4_vs.h"
 #include "brw_private.h"
 #include "dev/intel_debug.h"

@ -28,11 +26,8 @@ brw_compile_vs(const struct brw_compiler *compiler,
   prog_data->base.base.ray_queries = nir->info.ray_queries;
   prog_data->base.base.total_scratch = 0;

-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
   brw_nir_apply_key(nir, compiler, &key->base, 8);

-   const unsigned *assembly = NULL;
-
   prog_data->inputs_read = nir->info.inputs_read;
   prog_data->double_inputs_read = nir->info.vs.double_inputs;

@ -83,17 +78,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
          prog_data->uses_drawid = true;

-   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
-    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
-    * vec4 mode, the hardware appears to wedge unless we read something.
-    */
-   if (is_scalar)
-      prog_data->base.urb_read_length =
-         DIV_ROUND_UP(nr_attribute_slots, 2);
-   else
-      prog_data->base.urb_read_length =
-         DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
-
+   prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
   prog_data->nr_attribute_slots = nr_attribute_slots;

   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
@ -114,58 +99,37 @@ brw_compile_vs(const struct brw_compiler *compiler,
      brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
   }

-   if (is_scalar) {
-      const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+   const unsigned dispatch_width = compiler->devinfo->ver >= 20 ? 16 : 8;
+   prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;

-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_vs()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg =
-         v.payload().num_regs / reg_unit(compiler->devinfo);
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, v.runtime_check_aads_emit,
-                     MESA_SHADER_VERTEX);
-      if (unlikely(debug_enabled)) {
-         const char *debug_name =
-            ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
-                            nir->info.label ? nir->info.label :
-                               "unnamed",
-                            nir->info.name);
-
-         g.enable_debug(debug_name);
-      }
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-      assembly = g.get_assembly();
+   fs_visitor v(compiler, &params->base, &key->base,
+                &prog_data->base.base, nir, dispatch_width,
+                params->base.stats != NULL, debug_enabled);
+   if (!v.run_vs()) {
+      params->base.error_str =
+         ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+      return NULL;
   }

-   if (!assembly) {
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
+   assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
+   prog_data->base.base.dispatch_grf_start_reg =
+      v.payload().num_regs / reg_unit(compiler->devinfo);

-      vec4_vs_visitor v(compiler, &params->base, key, prog_data,
-                        nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
+   fs_generator g(compiler, &params->base,
+                  &prog_data->base.base, v.runtime_check_aads_emit,
+                  MESA_SHADER_VERTEX);
+   if (unlikely(debug_enabled)) {
+      const char *debug_name =
+         ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
+                         nir->info.label ? nir->info.label :
+                            "unnamed",
+                         nir->info.name);

-      assembly = brw_vec4_generate_assembly(compiler, &params->base,
-                                            nir, &prog_data->base,
-                                            v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
+      g.enable_debug(debug_name);
   }
+   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                   v.performance_analysis.require(), params->base.stats);
+   g.add_const_data(nir->constant_data, nir->constant_data_size);

-   return assembly;
+   return g.get_assembly();
 }
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@ -29,77 +29,51 @@
 #include "compiler/nir/nir.h"
 #include "util/u_debug.h"

-#define COMMON_OPTIONS                                                        \
-   .has_uclz = true,                                                          \
-   .lower_fdiv = true,                                                        \
-   .lower_scmp = true,                                                        \
-   .lower_flrp16 = true,                                                      \
-   .lower_fmod = true,                                                        \
-   .lower_ufind_msb = true,                                                   \
-   .lower_uadd_carry = true,                                                  \
-   .lower_usub_borrow = true,                                                 \
-   .lower_flrp64 = true,                                                      \
-   .lower_fisnormal = true,                                                   \
-   .lower_isign = true,                                                       \
-   .lower_ldexp = true,                                                       \
-   .lower_bitfield_extract = true,                                            \
-   .lower_bitfield_insert = true,                                             \
-   .lower_device_index_to_zero = true,                                        \
-   .vectorize_io = true,                                                      \
-   .vectorize_tess_levels = true,                                             \
-   .use_interpolated_input_intrinsics = true,                                 \
-   .lower_insert_byte = true,                                                 \
-   .lower_insert_word = true,                                                 \
-   .vertex_id_zero_based = true,                                              \
-   .lower_base_vertex = true,                                                 \
-   .support_16bit_alu = true,                                                 \
-   .lower_uniforms_to_ubo = true
-
-#define COMMON_SCALAR_OPTIONS                                                 \
-   .lower_to_scalar = true,                                                   \
-   .lower_pack_half_2x16 = true,                                              \
-   .lower_pack_snorm_2x16 = true,                                             \
-   .lower_pack_snorm_4x8 = true,                                              \
-   .lower_pack_unorm_2x16 = true,                                             \
-   .lower_pack_unorm_4x8 = true,                                              \
-   .lower_unpack_half_2x16 = true,                                            \
-   .lower_unpack_snorm_2x16 = true,                                           \
-   .lower_unpack_snorm_4x8 = true,                                            \
-   .lower_unpack_unorm_2x16 = true,                                           \
-   .lower_unpack_unorm_4x8 = true,                                            \
-   .lower_hadd64 = true,                                                      \
-   .avoid_ternary_with_two_constants = true,                                  \
-   .has_pack_32_4x8 = true,                                                   \
-   .max_unroll_iterations = 32,                                               \
-   .force_indirect_unrolling = nir_var_function_temp,                         \
-   .divergence_analysis_options =                                             \
-      (nir_divergence_single_patch_per_tcs_subgroup |                         \
-       nir_divergence_single_patch_per_tes_subgroup |                         \
-       nir_divergence_shader_record_ptr_uniform)
-
 const struct nir_shader_compiler_options brw_scalar_nir_options = {
-   COMMON_OPTIONS,
-   COMMON_SCALAR_OPTIONS,
-};
-
-const struct nir_shader_compiler_options brw_vector_nir_options = {
-   COMMON_OPTIONS,
-
-   /* In the vec4 backend, our dpN instruction replicates its result to all the
-    * components of a vec4.  We would like NIR to give us replicated fdot
-    * instructions because it can optimize better for us.
-    */
-   .fdot_replicates = true,
-
-   .lower_usub_sat = true,
+   .avoid_ternary_with_two_constants = true,
+   .divergence_analysis_options =
+      (nir_divergence_single_patch_per_tcs_subgroup |
+       nir_divergence_single_patch_per_tes_subgroup |
+       nir_divergence_shader_record_ptr_uniform),
+   .force_indirect_unrolling = nir_var_function_temp,
+   .has_pack_32_4x8 = true,
+   .has_uclz = true,
+   .lower_base_vertex = true,
+   .lower_bitfield_extract = true,
+   .lower_bitfield_insert = true,
+   .lower_device_index_to_zero = true,
+   .lower_fdiv = true,
+   .lower_fisnormal = true,
+   .lower_flrp16 = true,
+   .lower_flrp64 = true,
+   .lower_fmod = true,
+   .lower_hadd64 = true,
+   .lower_insert_byte = true,
+   .lower_insert_word = true,
+   .lower_isign = true,
+   .lower_ldexp = true,
+   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_scmp = true,
+   .lower_to_scalar = true,
+   .lower_uadd_carry = true,
+   .lower_ufind_msb = true,
+   .lower_uniforms_to_ubo = true,
+   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
-   .lower_extract_byte = true,
-   .lower_extract_word = true,
-   .intel_vec4 = true,
+   .lower_unpack_unorm_4x8 = true,
+   .lower_usub_borrow = true,
   .max_unroll_iterations = 32,
+   .support_16bit_alu = true,
+   .use_interpolated_input_intrinsics = true,
+   .vectorize_io = true,
+   .vectorize_tess_levels = true,
+   .vertex_id_zero_based = true,
 };

 struct brw_compiler *
@ -129,15 +103,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
       devinfo->platform != INTEL_PLATFORM_ARL_H) ||
      debug_get_bool_option("INTEL_LOWER_DPAS", false);

-   /* There is no vec4 mode on Gfx10+, and we don't use it at all on Gfx8+. */
-   for (int i = MESA_SHADER_VERTEX; i < MESA_ALL_SHADER_STAGES; i++) {
-      compiler->scalar_stage[i] = devinfo->ver >= 8 ||
-         i == MESA_SHADER_FRAGMENT || i == MESA_SHADER_COMPUTE;
-   }
-
-   for (int i = MESA_SHADER_TASK; i < MESA_VULKAN_SHADER_STAGES; i++)
-      compiler->scalar_stage[i] = true;
-
   nir_lower_int64_options int64_options =
      nir_lower_imul64 |
      nir_lower_isign64 |
@ -175,13 +140,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
   for (int i = 0; i < MESA_ALL_SHADER_STAGES; i++) {
      struct nir_shader_compiler_options *nir_options =
         rzalloc(compiler, struct nir_shader_compiler_options);
-      bool is_scalar = compiler->scalar_stage[i];
-      if (is_scalar) {
-         *nir_options = brw_scalar_nir_options;
-         int64_options |= nir_lower_usub_sat64;
-      } else {
-         *nir_options = brw_vector_nir_options;
-      }
+      *nir_options = brw_scalar_nir_options;
+      int64_options |= nir_lower_usub_sat64;

      /* Prior to Gfx6, there are no three source operations, and Gfx11 loses
       * LRP.
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@ -86,7 +86,6 @@ struct brw_compiler {
   void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
   void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);

-   bool scalar_stage[MESA_ALL_SHADER_STAGES];
   bool use_tcs_multi_patch;
   struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];

--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@ -714,7 +714,7 @@ brw_nir_lower_fs_outputs(nir_shader *nir)
 })

 void
-brw_nir_optimize(nir_shader *nir, bool is_scalar,
+brw_nir_optimize(nir_shader *nir,
                 const struct intel_device_info *devinfo)
 {
   bool progress;
@ -752,18 +752,11 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
      OPT(nir_opt_ray_queries);
      OPT(nir_opt_ray_query_ranges);

-      if (is_scalar) {
-         OPT(nir_lower_alu_to_scalar, NULL, NULL);
-      } else {
-         OPT(nir_opt_shrink_stores, true);
-         OPT(nir_opt_shrink_vectors);
-      }
+      OPT(nir_lower_alu_to_scalar, NULL, NULL);

      OPT(nir_copy_prop);

-      if (is_scalar) {
-         OPT(nir_lower_phis_to_scalar, false);
-      }
+      OPT(nir_lower_phis_to_scalar, false);

      OPT(nir_copy_prop);
      OPT(nir_opt_dce);
@ -784,15 +777,9 @@ brw_nir_optimize(nir_shader *nir, bool is_scalar,
       * For indirect loads of uniforms (push constants), we assume that array
       * indices will nearly always be in bounds and the cost of the load is
       * low.  Therefore there shouldn't be a performance benefit to avoid it.
-       * However, in vec4 tessellation shaders, these loads operate by
-       * actually pulling from memory.
       */
-      const bool is_vec4_tessellation = !is_scalar &&
-         (nir->info.stage == MESA_SHADER_TESS_CTRL ||
-          nir->info.stage == MESA_SHADER_TESS_EVAL);
-      OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
-      OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation,
-          devinfo->ver >= 6);
+      OPT(nir_opt_peephole_select, 0, true, false);
+      OPT(nir_opt_peephole_select, 8, true, devinfo->ver >= 6);

      OPT(nir_opt_intrinsics);
      OPT(nir_opt_idiv_const, 32);
@ -1014,15 +1001,11 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
   const struct intel_device_info *devinfo = compiler->devinfo;
   UNUSED bool progress; /* Written by OPT */

-   const bool is_scalar = compiler->scalar_stage[nir->info.stage];
-
   nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");

   OPT(nir_lower_frexp);

-   if (is_scalar) {
-      OPT(nir_lower_alu_to_scalar, NULL, NULL);
-   }
+   OPT(nir_lower_alu_to_scalar, NULL, NULL);

   if (nir->info.stage == MESA_SHADER_GEOMETRY)
      OPT(nir_lower_gs_intrinsics, 0);
@ -1081,7 +1064,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
   OPT(nir_split_var_copies);
   OPT(nir_split_struct_vars, nir_var_function_temp);

-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);

   OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options);
   if (OPT(nir_lower_int64_float_conversions)) {
@ -1102,9 +1085,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
      OPT(nir_opt_large_constants, NULL, 32);
   }

-   if (is_scalar) {
-      OPT(nir_lower_load_const_to_scalar);
-   }
+   OPT(nir_lower_load_const_to_scalar);

   OPT(nir_lower_system_values);
   nir_lower_compute_system_values_options lower_csv_options = {
@ -1116,7 +1097,6 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
      .ballot_bit_size = 32,
      .ballot_components = 1,
      .lower_to_scalar = true,
-      .lower_vote_trivial = !is_scalar,
      .lower_relative_shuffle = true,
      .lower_quad_broadcast_dynamic = true,
      .lower_elect = true,
@ -1142,7 +1122,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
    * issues are helped but nothing else in shader-db is hurt except for maybe
    * that one kerbal space program shader.
    */
-   if (is_scalar && !(indirect_mask & nir_var_function_temp))
+   if (!(indirect_mask & nir_var_function_temp))
      OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);

   /* Lower array derefs of vectors for SSBO and UBO loads.  For both UBOs and
@ -1165,7 +1145,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
      OPT(intel_nir_clamp_per_vertex_loads);

   /* Get rid of split copies */
-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);
 }

 static bool
@ -1321,18 +1301,13 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
   nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
   nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");

-   const bool p_is_scalar = compiler->scalar_stage[producer->info.stage];
-   const bool c_is_scalar = compiler->scalar_stage[consumer->info.stage];
-
-   if (p_is_scalar && c_is_scalar) {
-      NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
-      NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
-      brw_nir_optimize(producer, p_is_scalar, devinfo);
-      brw_nir_optimize(consumer, c_is_scalar, devinfo);
-   }
+   NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
+   NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
+   brw_nir_optimize(producer, devinfo);
+   brw_nir_optimize(consumer, devinfo);

   if (nir_link_opt_varyings(producer, consumer))
-      brw_nir_optimize(consumer, c_is_scalar, devinfo);
+      brw_nir_optimize(consumer, devinfo);

   NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
   NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
@ -1361,8 +1336,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
                  brw_nir_no_indirect_mask(compiler, consumer->info.stage),
                  UINT32_MAX);

-      brw_nir_optimize(producer, p_is_scalar, devinfo);
-      brw_nir_optimize(consumer, c_is_scalar, devinfo);
+      brw_nir_optimize(producer, devinfo);
+      brw_nir_optimize(consumer, devinfo);

      if (producer->info.stage == MESA_SHADER_MESH &&
            consumer->info.stage == MESA_SHADER_FRAGMENT) {
@ -1591,48 +1566,45 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
                               enum brw_robustness_flags robust_flags)
 {
   bool progress = false;
-   const bool is_scalar = compiler->scalar_stage[nir->info.stage];

-   if (is_scalar) {
-      nir_load_store_vectorize_options options = {
-         .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
-                  nir_var_mem_global | nir_var_mem_shared |
-                  nir_var_mem_task_payload,
-         .callback = brw_nir_should_vectorize_mem,
-         .robust_modes = (nir_variable_mode)0,
-      };
+   nir_load_store_vectorize_options options = {
+      .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
+               nir_var_mem_global | nir_var_mem_shared |
+               nir_var_mem_task_payload,
+      .callback = brw_nir_should_vectorize_mem,
+      .robust_modes = (nir_variable_mode)0,
+   };

-      if (robust_flags & BRW_ROBUSTNESS_UBO)
-         options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
-      if (robust_flags & BRW_ROBUSTNESS_SSBO)
-         options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;
+   if (robust_flags & BRW_ROBUSTNESS_UBO)
+      options.robust_modes |= nir_var_mem_ubo | nir_var_mem_global;
+   if (robust_flags & BRW_ROBUSTNESS_SSBO)
+      options.robust_modes |= nir_var_mem_ssbo | nir_var_mem_global;

-      OPT(nir_opt_load_store_vectorize, &options);
+   OPT(nir_opt_load_store_vectorize, &options);

-      /* Only run the blockify optimization on Gfx9+ because although prior HW
-       * versions have support for block loads, they do have limitations on
-       * alignment as well as requiring split sends which are not supported
-       * there.
+   /* Only run the blockify optimization on Gfx9+ because although prior HW
+    * versions have support for block loads, they do have limitations on
+    * alignment as well as requiring split sends which are not supported
+    * there.
+    */
+   if (compiler->devinfo->ver >= 9) {
+      /* Required for nir_divergence_analysis() */
+      OPT(nir_convert_to_lcssa, true, true);
+
+      /* When HW supports block loads, using the divergence analysis, try
+       * to find uniform SSBO loads and turn them into block loads.
+       *
+       * Rerun the vectorizer after that to make the largest possible block
+       * loads.
+       *
+       * This is a win on 2 fronts :
+       *   - fewer send messages
+       *   - reduced register pressure
       */
-      if (compiler->devinfo->ver >= 9) {
-         /* Required for nir_divergence_analysis() */
-         OPT(nir_convert_to_lcssa, true, true);
-
-         /* When HW supports block loads, using the divergence analysis, try
-          * to find uniform SSBO loads and turn them into block loads.
-          *
-          * Rerun the vectorizer after that to make the largest possible block
-          * loads.
-          *
-          * This is a win on 2 fronts :
-          *   - fewer send messages
-          *   - reduced register pressure
-          */
-         nir_divergence_analysis(nir);
-         if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
-            OPT(nir_opt_load_store_vectorize, &options);
-         OPT(nir_opt_remove_phis);
-      }
+      nir_divergence_analysis(nir);
+      if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
+         OPT(nir_opt_load_store_vectorize, &options);
+      OPT(nir_opt_remove_phis);
   }

   nir_lower_mem_access_bit_sizes_options mem_access_options = {
@ -1683,7 +1655,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
                    enum brw_robustness_flags robust_flags)
 {
   const struct intel_device_info *devinfo = compiler->devinfo;
-   const bool is_scalar = compiler->scalar_stage[nir->info.stage];

   UNUSED bool progress; /* Written by OPT */

@ -1710,20 +1681,20 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
   if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
      NIR_PASS(_, nir, intel_nir_lower_shading_rate_output);

-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);

-   if (is_scalar && nir_shader_has_local_variables(nir)) {
+   if (nir_shader_has_local_variables(nir)) {
      OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
          glsl_get_natural_size_align_bytes);
      OPT(nir_lower_explicit_io, nir_var_function_temp,
          nir_address_format_32bit_offset);
-      brw_nir_optimize(nir, is_scalar, devinfo);
+      brw_nir_optimize(nir, devinfo);
   }

   brw_vectorize_lower_mem_access(nir, compiler, robust_flags);

   if (OPT(nir_lower_int64))
-      brw_nir_optimize(nir, is_scalar, devinfo);
+      brw_nir_optimize(nir, devinfo);

   if (devinfo->ver >= 6) {
      /* Try and fuse multiply-adds, if successful, run shrink_vectors to
@ -1741,8 +1712,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
         OPT(nir_opt_shrink_vectors);
   }

-   if (is_scalar)
-      OPT(intel_nir_opt_peephole_imul32x16);
+   OPT(intel_nir_opt_peephole_imul32x16);

   if (OPT(nir_opt_comparison_pre)) {
      OPT(nir_copy_prop);
@ -1753,27 +1723,15 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
       * the other optimization passes) will have removed at least one
       * instruction from one of the branches of the if-statement, so now it
       * might be under the threshold of conversion to bcsel.
-       *
-       * See brw_nir_optimize for the explanation of is_vec4_tessellation.
       */
-      const bool is_vec4_tessellation = !is_scalar &&
-         (nir->info.stage == MESA_SHADER_TESS_CTRL ||
-          nir->info.stage == MESA_SHADER_TESS_EVAL);
-      OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false);
-      OPT(nir_opt_peephole_select, 1, is_vec4_tessellation,
-          compiler->devinfo->ver >= 6);
+      OPT(nir_opt_peephole_select, 0, false, false);
+      OPT(nir_opt_peephole_select, 1, false, compiler->devinfo->ver >= 6);
   }

   do {
      progress = false;
      if (OPT(nir_opt_algebraic_late)) {
-         /* At this late stage, anything that makes more constants will wreak
-          * havok on the vec4 backend.  The handling of constants in the vec4
-          * backend is not good.
-          */
-         if (is_scalar)
-            OPT(nir_opt_constant_folding);
-
+         OPT(nir_opt_constant_folding);
         OPT(nir_copy_prop);
         OPT(nir_opt_dce);
         OPT(nir_opt_cse);
@ -1783,19 +1741,16 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,

   if (OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64)) {
      if (OPT(nir_lower_int64)) {
-         brw_nir_optimize(nir, is_scalar, devinfo);
+         brw_nir_optimize(nir, devinfo);
      }
   }

   OPT(intel_nir_lower_conversions);

-   if (is_scalar)
-      OPT(nir_lower_alu_to_scalar, NULL, NULL);
+   OPT(nir_lower_alu_to_scalar, NULL, NULL);

   while (OPT(nir_opt_algebraic_distribute_src_mods)) {
-      if (is_scalar)
-         OPT(nir_opt_constant_folding);
-
+      OPT(nir_opt_constant_folding);
      OPT(nir_copy_prop);
      OPT(nir_opt_dce);
      OPT(nir_opt_cse);
@ -1821,7 +1776,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
      OPT(nir_lower_subgroups, &subgroups_options);

      if (OPT(nir_lower_int64))
-         brw_nir_optimize(nir, is_scalar, devinfo);
+         brw_nir_optimize(nir, devinfo);

      divergence_analysis_dirty = true;
   }
@ -1834,7 +1789,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
       * that must be lowered.
       */
      if (OPT(nir_lower_int64))
-         brw_nir_optimize(nir, is_scalar, devinfo);
+         brw_nir_optimize(nir, devinfo);

      OPT(nir_lower_subgroups, &subgroups_options);
   }
@ -1880,11 +1835,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,

   OPT(nir_convert_from_ssa, true);

-   if (!is_scalar) {
-      OPT(nir_move_vec_src_uses_to_dest, true);
-      OPT(nir_lower_vec_to_regs, NULL, NULL);
-   }
-
   OPT(nir_opt_dce);

   if (OPT(nir_opt_rematerialize_compares))
@ -2035,8 +1985,7 @@ brw_nir_apply_key(nir_shader *nir,
      OPT(brw_nir_limit_trig_input_range_workaround);

   if (progress) {
-      const bool is_scalar = compiler->scalar_stage[nir->info.stage];
-      brw_nir_optimize(nir, is_scalar, compiler->devinfo);
+      brw_nir_optimize(nir, compiler->devinfo);
   }
 }

--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@ -34,7 +34,6 @@ extern "C" {
 #endif

 extern const struct nir_shader_compiler_options brw_scalar_nir_options;
-extern const struct nir_shader_compiler_options brw_vector_nir_options;

 int type_size_vec4(const struct glsl_type *type, bool bindless);
 int type_size_dvec4(const struct glsl_type *type, bool bindless);
@ -268,7 +267,7 @@ void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
                                nir_shader *nir,
                                struct brw_ubo_range out_ranges[4]);

-void brw_nir_optimize(nir_shader *nir, bool is_scalar,
+void brw_nir_optimize(nir_shader *nir,
                      const struct intel_device_info *devinfo);

 nir_shader *brw_nir_create_passthrough_tcs(void *mem_ctx,
--- a/src/intel/compiler/brw_nir_rt.c
+++ b/src/intel/compiler/brw_nir_rt.c
@ -529,8 +529,7 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,

   NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, NULL);

-   const bool is_scalar = true;
-   brw_nir_optimize(nir, is_scalar, devinfo);
+   brw_nir_optimize(nir, devinfo);

   return nir;
 }
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@ -26,7 +26,6 @@
 #include "brw_fs.h"
 #include "brw_nir.h"
 #include "brw_private.h"
-#include "brw_vec4_tes.h"
 #include "dev/intel_debug.h"
 #include "util/macros.h"
 #include "util/u_debug.h"
@ -1310,9 +1309,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
   const struct intel_vue_map *input_vue_map = params->input_vue_map;
   struct brw_tes_prog_data *prog_data = params->prog_data;

-   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
   const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TES);
-   const unsigned *assembly;

   prog_data->base.base.stage = MESA_SHADER_TESS_EVAL;
   prog_data->base.base.ray_queries = nir->info.ray_queries;
@ -1395,55 +1392,35 @@ brw_compile_tes(const struct brw_compiler *compiler,
                        MESA_SHADER_TESS_EVAL);
   }

-   if (is_scalar) {
-      const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
-      fs_visitor v(compiler, &params->base, &key->base,
-                   &prog_data->base.base, nir, dispatch_width,
-                   params->base.stats != NULL, debug_enabled);
-      if (!v.run_tes()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      assert(v.payload().num_regs % reg_unit(devinfo) == 0);
-      prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
-
-      prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
-
-      fs_generator g(compiler, &params->base,
-                     &prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
-      if (unlikely(debug_enabled)) {
-         g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
-                                        "%s tessellation evaluation shader %s",
-                                        nir->info.label ? nir->info.label
-                                                        : "unnamed",
-                                        nir->info.name));
-      }
-
-      g.generate_code(v.cfg, dispatch_width, v.shader_stats,
-                      v.performance_analysis.require(), params->base.stats);
-
-      g.add_const_data(nir->constant_data, nir->constant_data_size);
-
-      assembly = g.get_assembly();
-   } else {
-      brw::vec4_tes_visitor v(compiler, &params->base, key, prog_data,
-                              nir, debug_enabled);
-      if (!v.run()) {
-         params->base.error_str =
-            ralloc_strdup(params->base.mem_ctx, v.fail_msg);
-	 return NULL;
-      }
-
-      if (unlikely(debug_enabled))
-	 v.dump_instructions();
-
-      assembly = brw_vec4_generate_assembly(compiler, &params->base, nir,
-                                            &prog_data->base, v.cfg,
-                                            v.performance_analysis.require(),
-                                            debug_enabled);
+   const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
+   fs_visitor v(compiler, &params->base, &key->base,
+                &prog_data->base.base, nir, dispatch_width,
+                params->base.stats != NULL, debug_enabled);
+   if (!v.run_tes()) {
+      params->base.error_str =
+         ralloc_strdup(params->base.mem_ctx, v.fail_msg);
+      return NULL;
   }

-   return assembly;
+   assert(v.payload().num_regs % reg_unit(devinfo) == 0);
+   prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
+
+   prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
+
+   fs_generator g(compiler, &params->base,
+                  &prog_data->base.base, false, MESA_SHADER_TESS_EVAL);
+   if (unlikely(debug_enabled)) {
+      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
+                                     "%s tessellation evaluation shader %s",
+                                     nir->info.label ? nir->info.label
+                                                     : "unnamed",
+                                     nir->info.name));
+   }
+
+   g.generate_code(v.cfg, dispatch_width, v.shader_stats,
+                   v.performance_analysis.require(), params->base.stats);
+
+   g.add_const_data(nir->constant_data, nir->constant_data_size);
+
+   return g.get_assembly();
 }
--- a/src/intel/compiler/brw_shader.h
+++ b/src/intel/compiler/brw_shader.h
@ -134,7 +134,6 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
                         gl_shader_stage stage)
 {
   const struct intel_device_info *devinfo = compiler->devinfo;
-   const bool is_scalar = compiler->scalar_stage[stage];
   nir_variable_mode indirect_mask = (nir_variable_mode) 0;

   switch (stage) {
@ -143,19 +142,14 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
      indirect_mask |= nir_var_shader_in;
      break;

-   case MESA_SHADER_GEOMETRY:
-      if (!is_scalar)
-         indirect_mask |= nir_var_shader_in;
-      break;
-
   default:
      /* Everything else can handle indirect inputs */
      break;
   }

-   if (is_scalar && stage != MESA_SHADER_TESS_CTRL &&
-                    stage != MESA_SHADER_TASK &&
-                    stage != MESA_SHADER_MESH)
+   if (stage != MESA_SHADER_TESS_CTRL &&
+       stage != MESA_SHADER_TASK &&
+       stage != MESA_SHADER_MESH)
      indirect_mask |= nir_var_shader_out;

   /* On HSW+, we allow indirects in scalar shaders.  They get implemented
@ -168,7 +162,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
    * indirects as scratch all the time, we may easily exceed this limit
    * without having any fallback.
    */
-   if (is_scalar && devinfo->verx10 <= 70)
+   if (devinfo->verx10 <= 70)
      indirect_mask |= nir_var_function_temp;

   return indirect_mask;
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@ -468,10 +468,7 @@ get_features(const struct anv_physical_device *pdevice,
      .textureCompressionBC                     = true,
      .occlusionQueryPrecise                    = true,
      .pipelineStatisticsQuery                  = true,
-      /* We can't do image stores in vec4 shaders */
-      .vertexPipelineStoresAndAtomics =
-         pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] &&
-         pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY],
+      .vertexPipelineStoresAndAtomics           = true,
      .fragmentStoresAndAtomics                 = true,
      .shaderTessellationAndGeometryPointSize   = true,
      .shaderImageGatherExtended                = true,
@ -940,8 +937,7 @@ get_properties_1_1(const struct anv_physical_device *pdevice,
   p->subgroupSize = BRW_SUBGROUP_SIZE;
   VkShaderStageFlags scalar_stages = 0;
   for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
-      if (pdevice->compiler->scalar_stage[stage])
-         scalar_stages |= mesa_to_vk_shader_stage(stage);
+      scalar_stages |= mesa_to_vk_shader_stage(stage);
   }
   if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
      scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@ -130,10 +130,8 @@ anv_nir_compute_push_layout(nir_shader *nir,
   push_start = MIN2(push_start, push_end);
   push_start = ROUND_DOWN_TO(push_start, 32);

-   /* For vec4 our push data size needs to be aligned to a vec4 and for
-    * scalar, it needs to be aligned to a DWORD.
-    */
-   const unsigned alignment = compiler->scalar_stage[nir->info.stage] ? 4 : 16;
+   /* For scalar, push data size needs to be aligned to a DWORD. */
+   const unsigned alignment = 4;
   nir->num_uniforms = ALIGN(push_end - push_start, alignment);
   prog_data->nr_params = nir->num_uniforms / 4;
   prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params);
@ -218,13 +216,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
   if (push_ubo_ranges) {
      brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);

-      /* The vec4 back-end pushes at most 32 regs while the scalar back-end
-       * pushes up to 64.  This is primarily because the scalar back-end has a
-       * massively more competent register allocator and so the risk of
-       * spilling due to UBO pushing isn't nearly as high.
-       */
-      const unsigned max_push_regs =
-         compiler->scalar_stage[nir->info.stage] ? 64 : 32;
+      const unsigned max_push_regs = 64;

      unsigned total_push_regs = push_constant_range.length;
      for (unsigned i = 0; i < 4; i++) {