diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index 926c7518c13..c41a9f693f1 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -1015,34 +1015,6 @@ bi_is_zs(unsigned location)
    return location == FRAG_RESULT_DEPTH || location == FRAG_RESULT_STENCIL;
 }
 
-static bool
-bifrost_nir_lower_shader_output_impl(struct nir_builder *b,
-                                     nir_intrinsic_instr *intr, void *data)
-{
-   if (intr->intrinsic != nir_intrinsic_store_output &&
-       intr->intrinsic != nir_intrinsic_store_per_view_output)
-      return false;
-
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-   unsigned mask = va_shader_output_from_loc(sem.location);
-
-   b->cursor = nir_instr_remove(&intr->instr);
-   nir_def *shader_output = nir_load_shader_output_pan(b);
-
-   nir_push_if(b, nir_i2b(b, nir_iand_imm(b, shader_output, mask)));
-   nir_builder_instr_insert(b, &intr->instr);
-   nir_pop_if(b, NULL);
-   return true;
-}
-
-static bool
-bifrost_nir_lower_shader_output(nir_shader *shader)
-{
-   return nir_shader_intrinsics_pass(shader,
-                                     bifrost_nir_lower_shader_output_impl,
-                                     nir_metadata_none, NULL);
-}
-
 /* Atomics and memory write on the vertex stage have implementation-defined
  * behaviors on how many invocations will happen. However for some reasons,
  * atomic counters on GL/GLES specs are quite ambigous here and even have tests
@@ -6133,13 +6105,6 @@ bifrost_postprocess_nir(nir_shader *nir, unsigned gpu_id)
       NIR_PASS(_, nir, nir_lower_viewport_transform);
       NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
       NIR_PASS(_, nir, pan_nir_lower_noperspective_vs);
-
-      /* nir_lower[_explicit]_io is lazy and emits mul+add chains even
-       * for offsets it could figure out are constant. Do some
-       * constant folding before pan_nir_lower_store_component below.
-       */
-      NIR_PASS(_, nir, nir_opt_constant_folding);
-      NIR_PASS(_, nir, pan_nir_lower_store_component);
    }
 
    nir_lower_mem_access_bit_sizes_options mem_size_options = {
@@ -6853,26 +6818,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
    if (info->vs.idvs && nir->info.writes_memory)
       NIR_PASS(_, nir, bifrost_nir_lower_vs_atomics);
 
-   if (info->vs.idvs) {
-      bool shader_output_pass = false;
-      NIR_PASS(shader_output_pass, nir, bifrost_nir_lower_shader_output);
-
-      /* If shader output lower made progress, ensure to merge adjacent if that were added */
-      if (shader_output_pass) {
-         /* First we clean up and deduplicate added condition logic */
-         NIR_PASS(_, nir, nir_opt_copy_prop);
-         NIR_PASS(_, nir, nir_opt_dce);
-         NIR_PASS(_, nir, nir_opt_cse);
-
-         /* We then move vec closer to their I/O stores to ensure nothing is between ifs */
-         NIR_PASS(_, nir, nir_opt_sink, nir_move_copies);
-         NIR_PASS(_, nir, nir_opt_move, nir_move_copies);
-
-         /* Finally run if optimization to ensure trivial if blocks are merged together */
-         NIR_PASS(_, nir, nir_opt_if, 0);
-      }
-   }
-
    bool has_extended_fifo = false;
    if (pan_arch(inputs->gpu_id) >= 9) {
       const uint64_t outputs = nir->info.outputs_written;
diff --git a/src/panfrost/compiler/meson.build b/src/panfrost/compiler/meson.build
index 6c9bb5eeef1..83a94dbde04 100644
--- a/src/panfrost/compiler/meson.build
+++ b/src/panfrost/compiler/meson.build
@@ -15,7 +15,6 @@ libpanfrost_compiler_files = files(
   'pan_nir_lower_image_ms.c',
   'pan_nir_lower_noperspective.c',
   'pan_nir_lower_sample_position.c',
-  'pan_nir_lower_store_component.c',
   'pan_nir_lower_texel_buffer_index.c',
   'pan_nir_lower_vertex_id.c',
   'pan_nir_lower_vs_outputs.c',
@@ -29,8 +28,8 @@ subdir('midgard')
 libpanfrost_compiler = static_library(
   'panfrost_compiler',
   [libpanfrost_compiler_files],
-  include_directories : [inc_include, inc_src],
-  dependencies: [idep_nir, idep_mesautil],
+  include_directories : [inc_include, inc_src, inc_valhall],
+  dependencies: [idep_nir, idep_mesautil, idep_valhall_enums_h, idep_bi_opcodes_h],
   link_with: [libpanfrost_midgard, libpanfrost_bifrost],
   c_args : [no_override_init_args],
   gnu_symbol_visibility : 'hidden',
diff --git a/src/panfrost/compiler/midgard/midgard_compile.c b/src/panfrost/compiler/midgard/midgard_compile.c
index ae4cd952c3c..b8a38a7bf35 100644
--- a/src/panfrost/compiler/midgard/midgard_compile.c
+++ b/src/panfrost/compiler/midgard/midgard_compile.c
@@ -388,13 +388,6 @@ midgard_postprocess_nir(nir_shader *nir, UNUSED unsigned gpu_id)
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       NIR_PASS(_, nir, nir_lower_viewport_transform);
       NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
-
-      /* nir_lower[_explicit]_io is lazy and emits mul+add chains even
-       * for offsets it could figure out are constant. Do some
-       * constant folding before pan_nir_lower_store_component below.
-       */
-      NIR_PASS(_, nir, nir_opt_constant_folding);
-      NIR_PASS(_, nir, pan_nir_lower_store_component);
    }
 
    /* Could be eventually useful for Vulkan, but we don't expect it to have
@@ -2989,6 +2982,12 @@ midgard_compile_shader_nir(nir_shader *nir,
       pan_nir_collect_noperspective_varyings_fs(nir);
    }
 
+   if (nir->info.stage == MESA_SHADER_VERTEX) {
+      NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
+               inputs->varying_layout, false /* has_idvs */,
+               false /* has_extended_fifo */);
+   }
+
    /* Optimisation passes */
    optimise_nir(nir, ctx->quirks, inputs->is_blend);
 
diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h
index 1dec76e6d51..1c34afc6edc 100644
--- a/src/panfrost/compiler/pan_nir.h
+++ b/src/panfrost/compiler/pan_nir.h
@@ -47,8 +47,6 @@ pan_nir_tile_default_coverage(nir_builder *b)
 
 bool pan_nir_lower_bool_to_bitsize(nir_shader *shader);
 
-bool pan_nir_lower_store_component(nir_shader *shader);
-
 bool pan_nir_lower_vertex_id(nir_shader *shader);
 
 bool pan_nir_lower_image_ms(nir_shader *shader);
diff --git a/src/panfrost/compiler/pan_nir_lower_store_component.c b/src/panfrost/compiler/pan_nir_lower_store_component.c
deleted file mode 100644
index e0bc7ef06e7..00000000000
--- a/src/panfrost/compiler/pan_nir_lower_store_component.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2020-2022 Collabora Ltd.
- * SPDX-License-Identifier: MIT
- */
-
-#include "compiler/nir/nir_builder.h"
-#include "pan_nir.h"
-
-/*
- * If the shader packs multiple varyings into the same location with different
- * location_frac, we'll need to lower to a single varying store that collects
- * all of the channels together. This is because the varying instruction on
- * Midgard and Bifrost is slot-based, writing out an entire vec4 slot at a time.
- *
- * NOTE: this expects all stores to be outside of control flow, and with
- * constant offsets. It should be run after nir_lower_io_vars_to_temporaries.
- */
-static bool
-lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data)
-{
-   if (intr->intrinsic != nir_intrinsic_store_output &&
-       intr->intrinsic != nir_intrinsic_store_per_view_output)
-      return false;
-
-   struct hash_table_u64 *slots = data;
-   unsigned component = nir_intrinsic_component(intr);
-   nir_src *slot_src = nir_get_io_offset_src(intr);
-   uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
-
-   if (intr->intrinsic == nir_intrinsic_store_per_view_output) {
-      uint64_t view_index = nir_src_as_uint(intr->src[1]);
-      slot |= view_index << 32;
-   }
-
-   nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
-   unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
-
-   nir_def *value = intr->src[0].ssa;
-   b->cursor = nir_before_instr(&intr->instr);
-
-   nir_def *undef = nir_undef(b, 1, value->bit_size);
-   nir_def *channels[4] = {undef, undef, undef, undef};
-
-   /* Copy old */
-   u_foreach_bit(i, mask) {
-      assert(prev != NULL);
-      nir_def *prev_ssa = prev->src[0].ssa;
-      channels[i] = nir_channel(b, prev_ssa, i);
-   }
-
-   /* Copy new */
-   unsigned new_mask = nir_intrinsic_write_mask(intr);
-   mask |= (new_mask << component);
-
-   u_foreach_bit(i, new_mask) {
-      assert(component + i < 4);
-      channels[component + i] = nir_channel(b, value, i);
-   }
-
-   intr->num_components = util_last_bit(mask);
-   nir_src_rewrite(&intr->src[0], nir_vec(b, channels, intr->num_components));
-
-   nir_intrinsic_set_component(intr, 0);
-   nir_intrinsic_set_write_mask(intr, mask);
-
-   if (prev) {
-      _mesa_hash_table_u64_remove(slots, slot);
-      nir_instr_remove(&prev->instr);
-   }
-
-   _mesa_hash_table_u64_insert(slots, slot, intr);
-   return true;
-}
-
-bool
-pan_nir_lower_store_component(nir_shader *s)
-{
-   assert(s->info.stage == MESA_SHADER_VERTEX);
-
-   struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL);
-   bool progress = nir_shader_intrinsics_pass(
-      s, lower_store_component,
-      nir_metadata_control_flow, stores);
-   _mesa_hash_table_u64_destroy(stores);
-   return progress;
-}
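
For reference, the merge that the deleted pan_nir_lower_store_component performed can be sketched in isolation. The standalone program below is a toy model, not Mesa code: struct slot_store, merge_store, and the float channel array are hypothetical names standing in for the NIR vec being rebuilt. It mirrors the pass's mask arithmetic (mask |= (new_mask << component), new channels overwriting old ones), which survives in the re-swizzle logic of the new gather_vs_outputs further down.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the write-mask merge: two partial stores to the same vec4
 * slot, each with its own component offset, become one store whose mask is
 * the union of the per-store masks and whose channels come from the latest
 * writer. Names and values here are invented for illustration. */
struct slot_store {
   uint8_t mask;    /* bit i set => channel i written */
   float chan[4];   /* channel values; unwritten ones stay undefined */
};

static void
merge_store(struct slot_store *slot, unsigned component, uint8_t write_mask,
            const float *values)
{
   /* Make the incoming mask slot-relative, as the pass did with
    * `mask |= (new_mask << component)`. */
   uint8_t slot_mask = (uint8_t)(write_mask << component);

   for (unsigned i = 0; i < 4; i++) {
      if (write_mask & (1u << i)) {
         assert(component + i < 4);
         slot->chan[component + i] = values[i];
      }
   }
   slot->mask |= slot_mask;
}

int
main(void)
{
   struct slot_store slot = {0};

   /* store_output(..., component=0, wrmask=0x3) writing a vec2 */
   merge_store(&slot, 0, 0x3, (const float[]){1.0f, 2.0f});
   /* store_output(..., component=2, wrmask=0x3) writing another vec2 */
   merge_store(&slot, 2, 0x3, (const float[]){3.0f, 4.0f});

   /* One slot-wide store remains: mask 0xf, channels 1 2 3 4 */
   printf("mask=0x%x chans=%g %g %g %g\n", slot.mask,
          slot.chan[0], slot.chan[1], slot.chan[2], slot.chan[3]);
   return 0;
}

Two vec2 stores at components 0 and 2 of one location end up as a single store with mask 0xf, matching the single slot-wide write the pass emitted for a packed vec4 slot.
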
diff --git a/src/panfrost/compiler/pan_nir_lower_vs_outputs.c b/src/panfrost/compiler/pan_nir_lower_vs_outputs.c
index ec07faf3f82..01ae096bbe3 100644
--- a/src/panfrost/compiler/pan_nir_lower_vs_outputs.c
+++ b/src/panfrost/compiler/pan_nir_lower_vs_outputs.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2025 Collabora, Ltd.
+ * Copyright (C) 2025-2026 Collabora, Ltd.
  * SPDX-License-Identifier: MIT
  */
 
@@ -7,21 +7,30 @@
 #include "nir_builder.h"
 #include "panfrost/model/pan_model.h"
+#include "bifrost/valhall/valhall.h"
+#include "util/bitscan.h"
 
 struct lower_vs_outputs_ctx {
    unsigned arch;
    const struct pan_varying_layout *varying_layout;
    bool has_idvs;
    bool has_extended_fifo;
+
+   nir_variable *variables[PAN_MAX_VARYINGS];
+   uint8_t per_view_written[PAN_MAX_VARYINGS];
+   unsigned used_buckets;
 };
 
 static void
-build_attr_buf_write(struct nir_builder *b, nir_def *data,
-                     const struct pan_varying_slot *slot, uint32_t view_index,
+build_attr_buf_write(struct nir_builder *b, nir_def *data, uint32_t idx,
+                     uint32_t view_index,
                      const struct lower_vs_outputs_ctx *ctx)
 {
    /* We need the precise memory layout */
    pan_varying_layout_require_layout(ctx->varying_layout);
 
+   const struct pan_varying_slot *slot =
+      pan_varying_layout_slot_at(ctx->varying_layout, idx);
+   assert(slot);
+
    nir_def *index = nir_load_idvs_output_buf_index_pan(b);
 
@@ -78,61 +87,146 @@ build_attr_desc_write(struct nir_builder *b, nir_def *data, uint32_t base,
    nir_store_global_cvt_pan(b, data, addr, cvt, .src_type = src_type);
 }
 
-static bool
-lower_vs_output_store(struct nir_builder *b,
-                      nir_intrinsic_instr *store, void *cb_data)
+static void
+build_store_output(struct nir_builder *b, nir_def *data, uint32_t idx,
+                   nir_alu_type src_type,
+                   const struct lower_vs_outputs_ctx *ctx)
 {
-   const struct lower_vs_outputs_ctx *ctx = cb_data;
+   pan_varying_layout_require_format(ctx->varying_layout);
 
-   if (store->intrinsic != nir_intrinsic_store_output &&
-       store->intrinsic != nir_intrinsic_store_per_view_output)
-      return false;
+   const struct pan_varying_slot *slot =
+      pan_varying_layout_slot_at(ctx->varying_layout, idx);
+   assert(slot);
 
-   b->cursor = nir_instr_remove(&store->instr);
+   nir_io_semantics sem = {
+      .location = slot->location,
+      .num_slots = 1,
+   };
 
-   nir_io_semantics sem = nir_intrinsic_io_semantics(store);
-   nir_alu_type src_type = nir_intrinsic_src_type(store);
-   unsigned src_bit_size = nir_alu_type_get_type_size(src_type);
+   nir_store_output(b, data, nir_imm_int(b, 0), .base = idx, .range = 1,
+                    .write_mask = BITFIELD_MASK(slot->ncomps), .component = 0,
+                    .src_type = src_type, .io_semantics = sem);
+}
+
+static void
+lower_vs_output_store(struct nir_builder *b,
+                      unsigned view_index,
+                      unsigned idx, const struct lower_vs_outputs_ctx *ctx)
+{
+   nir_variable *var = ctx->variables[idx];
+   assert(var != NULL);
+
+   nir_alu_type src_type =
+      nir_get_nir_type_for_glsl_type(glsl_without_array(var->type));
+
+   bool is_per_view = glsl_type_is_array(var->type);
+   assert(is_per_view || view_index == 0);
+   nir_def *data = is_per_view ? nir_load_array_var_imm(b, var, view_index)
+                               : nir_load_var(b, var);
+
+   if (ctx->arch >= 9 && ctx->has_idvs) {
+      build_attr_buf_write(b, data, idx, view_index, ctx);
+   } else if (ctx->arch >= 6) {
+      assert(!is_per_view);
+      build_attr_desc_write(b, data, idx, src_type, ctx);
+   } else {
+      assert(!is_per_view);
+      build_store_output(b, data, idx, src_type, ctx);
+   }
+}
+
+static nir_variable *
+get_or_create_var(nir_builder *b, struct lower_vs_outputs_ctx *ctx,
+                  nir_intrinsic_instr *intr)
+{
+   assert(intr->intrinsic == nir_intrinsic_store_output ||
+          intr->intrinsic == nir_intrinsic_store_per_view_output);
+   bool is_per_view = intr->intrinsic == nir_intrinsic_store_per_view_output;
+   ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+   unsigned slot_idx = nir_intrinsic_base(intr);
+   nir_alu_type src_type = nir_intrinsic_src_type(intr);
+   enum glsl_base_type base_type = nir_get_glsl_base_type_for_nir_type(src_type);
 
    /* Indirect array varyings are not yet supported (num_slots > 1) */
    assert(sem.num_slots == 1);
-   assert(nir_src_as_uint(*nir_get_io_offset_src(store)) == 0);
+   assert(nir_src_as_uint(*nir_get_io_offset_src(intr)) == 0);
 
-   /* We need the slot section for cache hints */
+   nir_variable *var = ctx->variables[slot_idx];
+   if (var != NULL) {
+      /* All stores should agree per-location */
+      assert(glsl_type_is_array(var->type) == is_per_view);
+      assert(glsl_get_base_type(glsl_without_array(var->type)) == base_type);
+      return var;
+   }
+
+   /* We need the slot section for the number of components */
    pan_varying_layout_require_format(ctx->varying_layout);
 
    const struct pan_varying_slot *slot =
-      pan_varying_layout_find_slot(ctx->varying_layout, sem.location);
+      pan_varying_layout_slot_at(ctx->varying_layout, slot_idx);
 
    /* Special slots are read only */
-   assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL);
-   /* From v9, IO is resized to the real size of the slot */
-   assert(ctx->arch < 9 ||
-          src_bit_size == nir_alu_type_get_type_size(slot->alu_type));
+   assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL &&
+          slot->location == sem.location);
 
-   nir_def *data = store->src[0].ssa;
-   assert(src_bit_size == data->bit_size);
+   const glsl_type *var_type = glsl_vector_type(base_type, slot->ncomps);
+   if (is_per_view)
+      var_type = glsl_array_type(var_type, PAN_MAX_MULTIVIEW_VIEW_COUNT, false);
 
-   /* Trim the input so we don't write extra channels at the end. In effect,
-    * we fill in all the intermediate "holes" in the write mask, since we
-    * can't mask off stores. Since nir_lower_io_vars_to_temporaries ensures
-    * each varying is written at most once, anything that's masked out is
-    * undefined, so it doesn't matter what we write there. So we may as well
-    * do the simplest thing possible.
-    */
+   var = nir_local_variable_create(b->impl, var_type, "vs_out_tmp");
+   ctx->variables[slot_idx] = var;
+   return var;
+}
+
+static bool
+gather_vs_outputs(struct nir_builder *b,
+                  nir_intrinsic_instr *intr, void *cb_data)
+{
+   struct lower_vs_outputs_ctx *ctx = cb_data;
+
+   if (intr->intrinsic != nir_intrinsic_store_output &&
+       intr->intrinsic != nir_intrinsic_store_per_view_output)
+      return false;
+
+   unsigned mask = nir_intrinsic_write_mask(intr);
+   unsigned slot_idx = nir_intrinsic_base(intr);
+   unsigned component = nir_intrinsic_component(intr);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+   bool is_per_view = intr->intrinsic == nir_intrinsic_store_per_view_output;
+   unsigned view_index = is_per_view ? nir_src_as_uint(intr->src[1]) : 0;
+
+   ctx->per_view_written[slot_idx] |= BITFIELD_BIT(view_index);
+   ctx->used_buckets |= BITFIELD_BIT(va_shader_output_from_loc(sem.location));
+
+   b->cursor = nir_instr_remove(&intr->instr);
+   nir_variable *var = get_or_create_var(b, ctx, intr);
+
+   unsigned ncomps = glsl_get_vector_elements(glsl_without_array(var->type));
+
+   /* store_output semantics differ from store_var when we write a subset of
+    * the components; we need to re-swizzle the channels manually.
+    */
-   const nir_component_mask_t write_mask = nir_intrinsic_write_mask(store);
-   data = nir_trim_vector(b, data, util_last_bit(write_mask));
+   nir_def *stored = intr->src[0].ssa;
+   if (stored->num_components != ncomps) {
+      nir_def *undef = nir_undef(b, 1, stored->bit_size);
+      assert(ncomps <= 4);
+      nir_def *channels[4] = {undef, undef, undef, undef};
 
-   if (ctx->arch >= 9 && ctx->has_idvs) {
-      uint32_t view_index = 0;
-      if (store->intrinsic == nir_intrinsic_store_per_view_output)
-         view_index = nir_src_as_uint(store->src[1]);
+      for (unsigned i = 0; i < stored->num_components; i++) {
+         assert(component + i < 4);
+         channels[component + i] = nir_channel(b, stored, i);
+      }
 
-      build_attr_buf_write(b, data, slot, view_index, ctx);
+      stored = nir_vec(b, channels, ncomps);
+      mask = mask << component; /* mask was relative to intr->component */
    } else {
-      uint32_t base = nir_intrinsic_base(store);
-      assert(store->intrinsic != nir_intrinsic_store_per_view_output);
-      build_attr_desc_write(b, data, base, src_type, ctx);
+      assert(component == 0);
    }
 
+   if (is_per_view)
+      nir_store_array_var_imm(b, var, view_index, stored, mask);
+   else
+      nir_store_var(b, var, stored, mask);
+
    return true;
 }
 
@@ -143,14 +237,62 @@ pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
                          bool has_extended_fifo)
 {
    assert(shader->info.stage == MESA_SHADER_VERTEX);
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
 
-   const struct lower_vs_outputs_ctx ctx = {
+   struct lower_vs_outputs_ctx ctx = {
       .arch = pan_arch(gpu_id),
      .varying_layout = varying_layout,
      .has_idvs = has_idvs,
      .has_extended_fifo = has_extended_fifo,
+     .variables = {NULL, },
+     .per_view_written = {0, },
+     .used_buckets = 0,
    };
 
-   return nir_shader_intrinsics_pass(shader, lower_vs_output_store,
-                                     nir_metadata_control_flow,
-                                     (void *)&ctx);
+   /* We use uint8_t as a per-view-written bitmask */
+   assert(PAN_MAX_MULTIVIEW_VIEW_COUNT <= 8 * sizeof(ctx.per_view_written[0]));
+   bool progress = nir_shader_intrinsics_pass(shader, gather_vs_outputs,
+                                              nir_metadata_control_flow,
+                                              (void *)&ctx);
+   if (!progress)
+      return false;
+
+   nir_builder builder = nir_builder_at(nir_after_impl(impl));
+   nir_builder *b = &builder;
+   nir_def *shader_output = has_idvs ? nir_load_shader_output_pan(b) : NULL;
+
+   for (int out = 0; out < VA_SHADER_OUTPUT_COUNT; out++) {
+      /* Skip buckets nothing writes to: no needless iteration and, for IDVS,
+       * no needless ifs */
+      if (!(ctx.used_buckets & BITFIELD_BIT(out)))
+         continue;
+
+      nir_if *nif = NULL;
+      if (has_idvs) {
+         nir_def *should_write =
+            nir_i2b(b, nir_iand_imm(b, shader_output, BITFIELD_BIT(out)));
+         nif = nir_push_if(b, should_write);
+      }
+
+      for (unsigned idx = 0; idx < varying_layout->count; idx++) {
+         nir_variable *var = ctx.variables[idx];
+         if (var == NULL)
+            continue;
+
+         const struct pan_varying_slot *slot =
+            pan_varying_layout_slot_at(varying_layout, idx);
+         if (slot == NULL || va_shader_output_from_loc(slot->location) != out)
+            continue;
+
+         /* Non-multiview stores are treated as writing just view index 0 */
+         unsigned view_mask = ctx.per_view_written[idx];
+         assert(view_mask != 0);
+
+         u_foreach_bit(view_index, view_mask)
+            lower_vs_output_store(b, view_index, idx, &ctx);
+      }
+
+      if (nif)
+         nir_pop_if(b, nif);
+   }
+
+   return nir_progress(true, impl, nir_metadata_none);
 }
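
The emission loop above replaces the removed bifrost_nir_lower_shader_output, which wrapped every individual store in its own if (shader_output & mask) block and then relied on a chain of cleanup passes to merge the resulting ifs. A minimal standalone sketch of the new guard shape follows; it is plain C for illustration only. The bucket enum, output table, and mask values are invented and do not correspond to the real va_shader_output_* encoding, and the shader_output variable stands in for the value nir_load_shader_output_pan() loads at runtime.

#include <stdio.h>

/* Toy model of the per-bucket guard: outputs are grouped into buckets (what
 * va_shader_output_from_loc computes), and one guard covers all stores of a
 * bucket instead of one guard per store. Names here are hypothetical. */
enum bucket { BUCKET_POSITION, BUCKET_PSIZ, BUCKET_VARYING, BUCKET_COUNT };

struct output {
   const char *name;
   enum bucket bucket;
};

int
main(void)
{
   const struct output outputs[] = {
      {"gl_Position", BUCKET_POSITION},
      {"vtx_color", BUCKET_VARYING},
      {"vtx_uv", BUCKET_VARYING},
   };
   const unsigned num_outputs = sizeof(outputs) / sizeof(outputs[0]);

   /* Pass time: record which buckets the shader writes at all */
   unsigned used_buckets = 0;
   for (unsigned i = 0; i < num_outputs; i++)
      used_buckets |= 1u << outputs[i].bucket;

   /* "Runtime" mask, standing in for nir_load_shader_output_pan(): here a
    * position-only IDVS phase requests just BUCKET_POSITION */
   unsigned shader_output = 1u << BUCKET_POSITION;

   for (unsigned out = 0; out < BUCKET_COUNT; out++) {
      /* Unused buckets produce no guard and no stores at all */
      if (!(used_buckets & (1u << out)))
         continue;

      /* One `if` guards every store of the bucket */
      if (!(shader_output & (1u << out)))
         continue;

      for (unsigned i = 0; i < num_outputs; i++) {
         if (outputs[i].bucket == out)
            printf("store %s\n", outputs[i].name);
      }
   }
   return 0;
}

Because the guards are now generated in one place, already merged per bucket, this is presumably why the nir_opt_copy_prop/nir_opt_sink/nir_opt_if cleanup sequence could be dropped from bifrost_compile_shader_nir in this patch.
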