From 3418525a82fe5cbd40ecb8533d216572f6eed8c5 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Tue, 17 Mar 2026 12:10:50 +0100
Subject: [PATCH] pan/bi: Lower VS outputs in NIR

Instead of translating store_output directly in the backend, lower
vertex shader output stores to explicit address computation in NIR.
Two new intrinsics, lea_attr_pan and lea_buf_pan, compute attribute
and buffer addresses, and load_idvs_output_buf_index_pan exposes the
IDVS output buffer index preloaded by the hardware on Valhall. The
stores themselves then become plain global stores.

Co-authored-by: Lorenzo Rossi
Reviewed-by: Lorenzo Rossi
Part-of:
---
 src/compiler/nir/nir_divergence_analysis.c    |   3 +
 src/compiler/nir/nir_intrinsics.py            |  11 +
 .../compiler/bifrost/bifrost_compile.c        | 275 +++++++-----
 src/panfrost/compiler/bifrost/compiler.h      |   2 -
 src/panfrost/compiler/pan_nir.h               |   4 +
 .../compiler/pan_nir_lower_varyings_io.c      | 158 ++++++++++
 6 files changed, 274 insertions(+), 179 deletions(-)

diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 8b591c73a0b..718c8160a0d 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -755,6 +755,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_input_attachment_target_pan:
    case nir_intrinsic_load_input_attachment_conv_pan:
    case nir_intrinsic_load_global_cvt_pan:
+   case nir_intrinsic_lea_attr_pan:
+   case nir_intrinsic_lea_buf_pan:
    case nir_intrinsic_atomic_counter_read:
    case nir_intrinsic_atomic_counter_read_deref:
    case nir_intrinsic_is_sparse_texels_resident:
@@ -1039,6 +1041,7 @@
    case nir_intrinsic_load_tile_res_pan:
    case nir_intrinsic_load_cumulative_coverage_pan:
    case nir_intrinsic_load_blend_input_pan:
+   case nir_intrinsic_load_idvs_output_buf_index_pan:
    case nir_intrinsic_atest_pan:
    case nir_intrinsic_zs_emit_pan:
    case nir_intrinsic_load_return_param_amd:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 76ed3fb2654..ae6410f990f 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1741,6 +1741,17 @@ store("global_cvt_pan", [1, 1], indices=[SRC_TYPE, ACCESS])
 # src[] = { value, address }
 store("global_psiz_pan", [1], indices=[WRITE_MASK, ACCESS])
 
+# Base index of the output buffer passed to IDVS shaders on Valhall.
+system_value("idvs_output_buf_index_pan", 1, bit_sizes=[32])
+
+# src[] = { handle, vertex_id, instance_id }; dest = { addr.lo, addr.hi, conversion descriptor }
+intrinsic("lea_attr_pan", [1, 1, 1], dest_comp=3, bit_sizes=[32],
+          indices=[SRC_TYPE], flags=[CAN_ELIMINATE, CAN_REORDER])
+
+# src[] = { handle, index }; dest = { addr.lo, addr.hi }
+intrinsic("lea_buf_pan", [1, 1], dest_comp=2, bit_sizes=[32],
+          flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # Load the address and potentially the conversion descriptor for a texel buffer index.
 # The 64 bit address is always in the first two channels, while the 32 bit
 # conversion descriptor is in the last channel only for Bifrost.
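
[Editor's note, not part of the patch: a minimal sketch of how the two new
address intrinsics are meant to compose, mirroring pan_nir_lower_vs_outputs
below. `data`, `attr_index`, and `res` are placeholder values; all builder
calls are taken from this series.]

   /* Attribute-descriptor path: lea_attr_pan yields { addr.lo, addr.hi, cvt }. */
   nir_def *addr_cvt =
      nir_lea_attr_pan(b, nir_imm_int(b, attr_index),
                       nir_load_raw_vertex_id_pan(b), nir_load_instance_id(b),
                       .src_type = nir_type_float32);
   nir_def *addr = nir_pack_64_2x32(b, nir_trim_vector(b, addr_cvt, 2));
   nir_store_global_cvt_pan(b, data, addr, nir_channel(b, addr_cvt, 2),
                            .src_type = nir_type_float32);

   /* Valhall IDVS buffer path: lea_buf_pan yields { addr.lo, addr.hi }. */
   nir_def *index = nir_load_idvs_output_buf_index_pan(b);
   nir_def *buf_addr =
      nir_pack_64_2x32(b, nir_lea_buf_pan(b, nir_imm_int(b, res), index));
   nir_store_global(b, data, buf_addr, .access = ACCESS_ESTREAM_PAN);
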
diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index 2a67a5b4dc2..033ce69e704 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -580,6 +580,80 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
    bi_copy_component(b, instr, dest);
 }
 
+static void
+bi_emit_lea_attr(bi_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->intrinsic == nir_intrinsic_lea_attr_pan);
+   const nir_alu_type src_fmt = nir_intrinsic_src_type(intr);
+
+   if (b->shader->arch < 9 && b->shader->idvs == BI_IDVS_POSITION) {
+      /* Bifrost position shaders have a fast path */
+      assert(nir_src_as_uint(intr->src[0]) == 0);
+      assert(src_fmt == nir_type_float32);
+      unsigned regfmt = BI_REGISTER_FORMAT_F32;
+      unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
+      unsigned snap4 = 0x5E;
+      uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
+      bi_collect_v3i32_to(b, bi_def_index(&intr->def),
+                          bi_preload(b, 58), bi_preload(b, 59),
+                          bi_imm_u32(format));
+      return;
+   }
+
+   bi_index vertex_id = bi_src_index(&intr->src[1]);
+   bi_index instance_id = bi_src_index(&intr->src[2]);
+   enum bi_register_format regfmt = bi_reg_fmt_for_nir(src_fmt);
+
+   /* Check if the index can fit in LEA_ATTR_IMM */
+   uint32_t imm_res = 0;
+   bool use_imm_form = false;
+   if (nir_src_is_const(intr->src[0])) {
+      imm_res = nir_src_as_uint(intr->src[0]);
+      use_imm_form = pan_res_handle_get_index(imm_res) < 0x10;
+   }
+
+   bi_index address = bi_def_index(&intr->def);
+   if (use_imm_form) {
+      bi_instr *I = bi_lea_attr_imm_to(b, address, vertex_id, instance_id,
+                                       regfmt,
+                                       pan_res_handle_get_index(imm_res));
+      if (b->shader->arch >= 9)
+         I->table = va_res_fold_table_idx(pan_res_handle_get_table(imm_res));
+   } else {
+      bi_index res = bi_src_index(&intr->src[0]);
+      bi_lea_attr_to(b, address, vertex_id, instance_id, res, regfmt);
+   }
+   bi_split_def(b, &intr->def);
+}
+
+static void
+bi_emit_lea_buf(bi_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->intrinsic == nir_intrinsic_lea_buf_pan);
+   assert(b->shader->arch >= 9);
+   bi_index index = bi_src_index(&intr->src[1]);
+
+   uint32_t imm_res = 0;
+   bool use_imm_form = false;
+   if (nir_src_is_const(intr->src[0])) {
+      imm_res = nir_src_as_uint(intr->src[0]);
+      uint32_t table_index = pan_res_handle_get_table(imm_res);
+      uint32_t res_index = pan_res_handle_get_index(imm_res);
+      use_imm_form = va_is_valid_const_table(table_index) && res_index < 256;
+   }
+
+   bi_index address = bi_def_index(&intr->def);
+   if (use_imm_form) {
+      bi_instr *I = bi_lea_buf_imm_to(b, address, index);
+      I->table = va_res_fold_table_idx(pan_res_handle_get_table(imm_res));
+      I->index = pan_res_handle_get_index(imm_res);
+   } else {
+      bi_index res = bi_src_index(&intr->src[0]);
+      bi_lea_buf_to(b, address, index, res);
+   }
+   bi_split_def(b, &intr->def);
+}
+
 static void
 bi_emit_load_var(bi_builder *b, nir_intrinsic_instr *intr)
 {
@@ -1046,169 +1120,6 @@ bifrost_nir_lower_vs_atomics(nir_shader *shader)
                                         nir_metadata_none, NULL);
 }
 
-static void
-bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
-{
-   /* In principle we can do better for 16-bit. At the moment we require
-    * mediump varyings to be 32-bit to permit the use of .auto, in order to
-    * force .u32 for flat varyings, to handle internal TGSI shaders that set
-    * flat in the VS but smooth in the FS.
-    *
-    * Explicit 16-bit types are unaffected, and written as 16-bit.
-    */
-
-   ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
-   ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
-   nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
-
-   const struct pan_varying_slot *slot =
-      pan_varying_layout_find_slot(b->shader->varying_layout, sem.location);
-   ASSERTED unsigned base = nir_intrinsic_base(instr);
-   assert(slot == &b->shader->varying_layout->slots[base]);
-
-   unsigned imm_index = 0;
-   bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
-
-   /* Only look at the total components needed. In effect, we fill in all
-    * the intermediate "holes" in the write mask, since we can't mask off
-    * stores. Since nir_lower_io_vars_to_temporaries ensures each varying is
-    * written at most once, anything that's masked out is undefined, so it
-    * doesn't matter what we write there. So we may as well do the
-    * simplest thing possible. */
-   unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
-   assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
-
-   bi_index data = bi_src_index(&instr->src[0]);
-
-   /* To keep the vector dimensions consistent, we need to drop some
-    * components. This should be coalesced.
-    *
-    * TODO: This is ugly and maybe inefficient. Would we rather
-    * introduce a TRIM.i32 pseudoinstruction?
-    */
-   if (nr < nir_intrinsic_src_components(instr, 0)) {
-      bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
-      unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
-      unsigned src_comps =
-         DIV_ROUND_UP(nir_intrinsic_src_components(instr, 0), comps_per_reg);
-      unsigned dst_comps = DIV_ROUND_UP(nr, comps_per_reg);
-
-      bi_emit_split_i32(b, chans, data, src_comps);
-
-      bi_index tmp = bi_temp(b->shader);
-      bi_instr *collect = bi_collect_i32_to(b, tmp, dst_comps);
-
-      bi_foreach_src(collect, w)
-         collect->src[w] = chans[w];
-
-      data = tmp;
-   }
-
-   bi_index a[4] = {bi_null()};
-
-   if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
-      /* Bifrost position shaders have a fast path */
-      assert(T == nir_type_float32);
-      unsigned regfmt = BI_REGISTER_FORMAT_F32;
-      unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
-      unsigned snap4 = 0x5E;
-      uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
-
-      bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
-                bi_imm_u32(format), regfmt, nr - 1);
-   } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
-      bi_index index = bi_preload(b, 59);
-      unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
-
-      unsigned index_offset = 0;
-      if (slot->section == PAN_VARYING_SECTION_ATTRIBS)
-         index_offset += 4;
-
-      if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
-         unsigned view_index = nir_src_as_uint(instr->src[1]);
-
-         if (slot->section == PAN_VARYING_SECTION_GENERIC) {
-            index_offset += view_index * 4;
-         } else {
-            /* We don't patch these offsets in the no_psiz variant, so if
-             * multiview is enabled we can't switch to the basic format by
-             * using no_psiz */
-            const uint64_t outputs = b->shader->nir->info.outputs_written;
-            bool extended_position_fifo =
-               valhal_writes_extended_fifo(outputs, false, true);
-            /* Must be the same with and without no_psiz */
-            assert(valhal_writes_extended_fifo(outputs, true, true) ==
-                   extended_position_fifo);
-            unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
-            index_offset += view_index * position_fifo_stride;
-         }
-      }
-
-      if (index_offset != 0)
-         index = bi_iadd_imm_i32(b, index, index_offset);
-
-      const enum va_memory_access mem_access =
-         slot->section == PAN_VARYING_SECTION_GENERIC ? VA_MEMORY_ACCESS_ESTREAM
-                                                      : VA_MEMORY_ACCESS_ISTREAM;
-
-      nir_src *offset_src = nir_get_io_offset_src(instr);
-      assert(nir_src_is_const(*offset_src) && "assumes immediate offset");
-      unsigned offset = slot->offset + (nir_src_as_uint(*offset_src) * 16);
-
-      /* On Valhall, with IDVS varying are stored in a hardware-controlled
-       * buffer through table 61 at index 0 */
-      bi_index address = bi_temp(b->shader);
-      bi_instr *I = bi_lea_buf_imm_to(b, address, index);
-      I->table = va_res_fold_table_idx(61);
-      I->index = 0;
-
-      /* On 5th Gen, the hardware-controlled buffer is at index 1 for varyings */
-      if (pan_arch(b->shader->inputs->gpu_id) >= 12 &&
-          slot->section == PAN_VARYING_SECTION_GENERIC) {
-         I->index = 1;
-      }
-
-      bi_emit_split_i32(b, a, address, 2);
-
-      bi_instr *S = bi_store(b, nr * src_bit_sz, data, a[0], a[1], BI_SEG_NONE,
-                             offset);
-      S->mem_access = mem_access;
-      S->is_psiz_write = slot->location == VARYING_SLOT_PSIZ;
-   } else {
-      assert(T_size == 32 || T_size == 16);
-
-      enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
-
-      /* Since v9 we cannot have separate attribute descriptors for VS-FS,
-       * There might be a mismatch on Gallium where the VS thinks it is storing
-       * an int, but the data is actually a float, and that's what FS expects.
-       * So, just for v9 onwards, just until we haven't fixed gallium, use auto32.
-       * We are still getting around the midgard quirk since we do this only
-       * from v9.
-       * TODO: fix all bugs with gallium and remove this patch
-       */
-      if (b->shader->arch >= 9 && T_size == 32)
-         regfmt = BI_REGISTER_FORMAT_AUTO;
-
-      if (immediate) {
-         bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b),
-                                            bi_instance_id(b),
-                                            regfmt, imm_index);
-         bi_emit_split_i32(b, a, address, 3);
-
-         bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
-      } else {
-         bi_index idx = bi_iadd_u32(b,
-                                    bi_src_index(nir_get_io_offset_src(instr)),
-                                    bi_imm_u32(nir_intrinsic_base(instr)), false);
-         bi_index address =
-            bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
-         bi_emit_split_i32(b, a, address, 3);
-
-         bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
-      }
-   }
-}
-
 static void
 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -2047,16 +1958,6 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       bi_emit_load_var_buf(b, instr);
       break;
 
-   case nir_intrinsic_store_output:
-   case nir_intrinsic_store_per_view_output:
-      if (stage == MESA_SHADER_FRAGMENT)
-         UNREACHABLE("Should have been lowered by pan_nir_lower_fs_outputs");
-      else if (stage == MESA_SHADER_VERTEX)
-         bi_emit_store_vary(b, instr);
-      else
-         UNREACHABLE("Unsupported shader stage");
-      break;
-
    case nir_intrinsic_load_cumulative_coverage_pan:
       bi_mov_i32_to(b, dst, bi_preload(b, 60));
       break;
@@ -2335,6 +2236,18 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       bi_emit_store_cvt(b, instr, va_memory_access_from_nir(instr));
       break;
 
+   case nir_intrinsic_load_idvs_output_buf_index_pan:
+      bi_mov_i32_to(b, dst, bi_preload(b, 59));
+      break;
+
+   case nir_intrinsic_lea_attr_pan:
+      bi_emit_lea_attr(b, instr);
+      break;
+
+   case nir_intrinsic_lea_buf_pan:
+      bi_emit_lea_buf(b, instr);
+      break;
+
    case nir_intrinsic_load_tile_pan:
    case nir_intrinsic_load_tile_res_pan:
       bi_emit_ld_tile(b, instr);
@@ -6631,10 +6544,6 @@ bi_compile_variant_nir(nir_shader *nir,
    ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
    ctx->fau_consts_count = info.init_fau_consts_count;
 
-   if (!mesa_shader_stage_is_compute(nir->info.stage)) {
-      ctx->varying_layout = inputs->varying_layout;
-   }
-
    unsigned execution_mode = nir->info.float_controls_execution_mode;
    ctx->rtz_fp16 = nir_is_rounding_mode_rtz(execution_mode, 16);
    ctx->rtz_fp32 = nir_is_rounding_mode_rtz(execution_mode, 32);
@@ -7098,6 +7007,18 @@ bifrost_compile_shader_nir(nir_shader *nir,
          NIR_PASS(_, nir, nir_opt_if, 0);
       }
    }
+
+      bool has_extended_fifo = false;
+      if (pan_arch(inputs->gpu_id) >= 9) {
+         const uint64_t outputs = nir->info.outputs_written;
+         has_extended_fifo = valhal_writes_extended_fifo(outputs, false, true);
+         /* Must be the same with and without no_psiz */
+         assert(valhal_writes_extended_fifo(outputs, true, true) ==
+                has_extended_fifo);
+      }
+
+      NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
+               inputs->varying_layout, info->vs.idvs, has_extended_fifo);
    }
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
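
[Editor's note, not part of the patch: the immediate-form checks in
bi_emit_lea_attr and bi_emit_lea_buf above (index < 0x10 for LEA_ATTR_IMM, a
valid constant table plus index < 256 for LEA_BUF_IMM) operate on packed
resource handles. A minimal sketch of the packing these helpers are assumed
to implement; illustrative only, the real definitions live in the panfrost
headers.]

   /* Assumed packing: table in bits [31:24], index in bits [23:0]. */
   static inline uint32_t pan_res_handle(uint8_t table, uint32_t index)
   {
      return ((uint32_t)table << 24) | index;
   }

   static inline uint32_t pan_res_handle_get_table(uint32_t handle)
   {
      return handle >> 24;
   }

   static inline uint32_t pan_res_handle_get_index(uint32_t handle)
   {
      return handle & BITFIELD_MASK(24);
   }
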
diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h
index a9fe1ddd837..c8f999e6e0b 100644
--- a/src/panfrost/compiler/bifrost/compiler.h
+++ b/src/panfrost/compiler/bifrost/compiler.h
@@ -1057,8 +1057,6 @@ typedef struct {
    enum bi_idvs_mode idvs;
    unsigned num_blocks;
 
-   const struct pan_varying_layout *varying_layout;
-
    /* Floating point rounding mode controls */
    bool rtz_fp16;
    bool rtz_fp32;
diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h
index 97ca12711e4..9e22a5d10a5 100644
--- a/src/panfrost/compiler/pan_nir.h
+++ b/src/panfrost/compiler/pan_nir.h
@@ -57,6 +57,10 @@ bool pan_nir_lower_frag_coord_zw(nir_shader *shader);
 bool pan_nir_lower_noperspective_vs(nir_shader *shader);
 bool pan_nir_lower_noperspective_fs(nir_shader *shader);
 
+bool pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
+                              const struct pan_varying_layout *varying_layout,
+                              bool has_idvs, bool has_extended_fifo);
+
 bool pan_nir_lower_fs_inputs(nir_shader *shader, unsigned gpu_id,
                              const struct pan_varying_layout *varying_layout,
                              bool valhall_use_ld_var_buf);
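
[Editor's note, not part of the patch: call-site shape for the new entry
point declared above, condensed from the bifrost_compile_shader_nir hunk
earlier in this patch; `nir`, `inputs`, and `info` follow the backend's
existing types.]

   /* Lower VS output stores before backend instruction selection. */
   bool has_extended_fifo = false;
   if (pan_arch(inputs->gpu_id) >= 9)
      has_extended_fifo =
         valhal_writes_extended_fifo(nir->info.outputs_written, false, true);

   NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
            inputs->varying_layout, info->vs.idvs, has_extended_fifo);
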
diff --git a/src/panfrost/compiler/pan_nir_lower_varyings_io.c b/src/panfrost/compiler/pan_nir_lower_varyings_io.c
index b90cf8d7935..42362c2763f 100644
--- a/src/panfrost/compiler/pan_nir_lower_varyings_io.c
+++ b/src/panfrost/compiler/pan_nir_lower_varyings_io.c
@@ -8,6 +8,164 @@
 
 #include "panfrost/model/pan_model.h"
 
+struct lower_vs_outputs_ctx {
+   unsigned arch;
+   const struct pan_varying_layout *varying_layout;
+   bool has_idvs;
+   bool has_extended_fifo;
+};
+
+static void
+build_attr_buf_write(struct nir_builder *b, nir_def *data,
+                     const struct pan_varying_slot *slot, uint32_t view_index,
+                     const struct lower_vs_outputs_ctx *ctx)
+{
+   /* We need the precise memory layout */
+   pan_varying_layout_require_layout(ctx->varying_layout);
+
+   nir_def *index = nir_load_idvs_output_buf_index_pan(b);
+
+   uint32_t res, view_stride;
+   if (slot->section == PAN_VARYING_SECTION_GENERIC) {
+      /* The varying buffer is bound at index 1 on v12+ */
+      uint32_t res_index = ctx->arch >= 12 ? 1 : 0;
+      res = pan_res_handle(61, res_index);
+      view_stride = 4;
+   } else {
+      res = pan_res_handle(61, 0);
+      view_stride = ctx->has_extended_fifo ? 8 : 4;
+   }
+
+   uint32_t index_offset = view_index * view_stride;
+   if (slot->section == PAN_VARYING_SECTION_ATTRIBS)
+      index_offset += 4;
+
+   /* v9+ cache hints: generic varyings don't need caching, while
+    * position/attribute varyings are reused by other units inside the GPU.
+    * TODO: Do we really want ESTREAM on generic varyings?
+    */
+   enum gl_access_qualifier access =
+      slot->section == PAN_VARYING_SECTION_GENERIC ? ACCESS_ESTREAM_PAN :
+                                                     ACCESS_ISTREAM_PAN;
+
+   index = nir_iadd_imm(b, index, index_offset);
+   nir_def *addr = nir_lea_buf_pan(b, nir_imm_int(b, res), index);
+   addr = nir_pack_64_2x32(b, addr);
+   addr = nir_iadd(b, addr, nir_imm_int64(b, slot->offset));
+
+   /* Tag writes to gl_PointSize with a special intrinsic */
+   if (slot->location == VARYING_SLOT_PSIZ) {
+      nir_store_global_psiz_pan(b, data, addr, .access = access);
+   } else {
+      nir_store_global(b, data, addr, .access = access);
+   }
+}
+
+static void
+build_attr_desc_write(struct nir_builder *b, nir_def *data, uint32_t base,
+                      nir_alu_type src_type,
+                      const struct lower_vs_outputs_ctx *ctx)
+{
+   nir_def *index = nir_imm_int(b, base);
+   nir_def *vertex_id = nir_load_raw_vertex_id_pan(b);
+   nir_def *instance_id = nir_load_instance_id(b);
+
+   nir_def *addr_cvt = nir_lea_attr_pan(b, index, vertex_id, instance_id,
+                                        .src_type = src_type);
+   nir_def *addr = nir_pack_64_2x32(b, nir_trim_vector(b, addr_cvt, 2));
+   nir_def *cvt = nir_channel(b, addr_cvt, 2);
+
+   nir_store_global_cvt_pan(b, data, addr, cvt, .src_type = src_type);
+}
+
+static bool
+lower_vs_output_store(struct nir_builder *b,
+                      nir_intrinsic_instr *store, void *cb_data)
+{
+   const struct lower_vs_outputs_ctx *ctx = cb_data;
+
+   if (store->intrinsic != nir_intrinsic_store_output &&
+       store->intrinsic != nir_intrinsic_store_per_view_output)
+      return false;
+
+   b->cursor = nir_instr_remove(&store->instr);
+
+   nir_io_semantics sem = nir_intrinsic_io_semantics(store);
+   nir_alu_type src_type = nir_intrinsic_src_type(store);
+   unsigned src_bit_size = nir_alu_type_get_type_size(src_type);
+
+   /* Indirect array varyings are not yet supported (num_slots > 1) */
+   assert(sem.num_slots == 1);
+   assert(nir_src_as_uint(*nir_get_io_offset_src(store)) == 0);
+
+   /* We need the slot section for cache hints */
+   pan_varying_layout_require_format(ctx->varying_layout);
+   const struct pan_varying_slot *slot =
+      pan_varying_layout_find_slot(ctx->varying_layout, sem.location);
+   /* Special slots are read only */
+   assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL);
+   /* From v9, IO is resized to the real size of the slot */
+   assert(ctx->arch < 9 ||
+          src_bit_size == nir_alu_type_get_type_size(slot->alu_type));
+
+   /* Since v9 we cannot have separate attribute descriptors for the VS and
+    * FS, so there might be a mismatch on Gallium where the VS thinks it is
+    * storing an int while the data is actually a float, which is what the
+    * FS expects. So, for v9 onwards, use auto32 until this is fixed in
+    * Gallium. We still get around the Midgard quirk because we only do
+    * this from v9.
+    * TODO: fix all bugs with gallium and remove this workaround
+    */
+   if (ctx->arch >= 9 && src_bit_size == 32)
+      src_type = nir_type_invalid | 32; /* auto32 */
+
+   nir_def *data = store->src[0].ssa;
+   assert(src_bit_size == data->bit_size);
+
+   /* Trim the input so we don't write extra channels at the end. In effect,
+    * we fill in all the intermediate "holes" in the write mask, since we
+    * can't mask off stores. Since nir_lower_io_vars_to_temporaries ensures
+    * each varying is written at most once, anything that's masked out is
+    * undefined, so it doesn't matter what we write there. So we may as well
+    * do the simplest thing possible.
+    */
+   const nir_component_mask_t write_mask = nir_intrinsic_write_mask(store);
+   data = nir_trim_vector(b, data, util_last_bit(write_mask));
+
+   if (ctx->arch >= 9 && ctx->has_idvs) {
+      uint32_t view_index = 0;
+      if (store->intrinsic == nir_intrinsic_store_per_view_output)
+         view_index = nir_src_as_uint(store->src[1]);
+
+      build_attr_buf_write(b, data, slot, view_index, ctx);
+   } else {
+      uint32_t base = nir_intrinsic_base(store);
+      assert(store->intrinsic != nir_intrinsic_store_per_view_output);
+      build_attr_desc_write(b, data, base, src_type, ctx);
+   }
+
+   return true;
+}
+
+bool
+pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
+                         const struct pan_varying_layout *varying_layout,
+                         bool has_idvs,
+                         bool has_extended_fifo)
+{
+   assert(shader->info.stage == MESA_SHADER_VERTEX);
+
+   const struct lower_vs_outputs_ctx ctx = {
+      .arch = pan_arch(gpu_id),
+      .varying_layout = varying_layout,
+      .has_idvs = has_idvs,
+      .has_extended_fifo = has_extended_fifo,
+   };
+   return nir_shader_intrinsics_pass(shader, lower_vs_output_store,
+                                     nir_metadata_control_flow,
+                                     (void *)&ctx);
+}
+
 struct lower_fs_inputs_ctx {
    unsigned arch;
    const struct pan_varying_layout *varying_layout;