From f0e1ccc8d490ded39b7f248fb9d564e167422b92 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Sun, 3 Mar 2024 10:09:08 -0400
Subject: [PATCH] asahi: rewrite varying linking

Lower store_output to store_uvs_agx + math. Link UVS indices at draw-time
instead of compile-time to get efficient separate shaders. Also picks up varying
compaction along the way.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
---
 src/asahi/compiler/agx_compile.c              | 153 +----------
 src/asahi/compiler/agx_compile.h              |  60 -----
 .../compiler/agx_nir_lower_clip_distance.c    |  42 ---
 src/asahi/compiler/agx_nir_lower_layer.c      |  70 -----
 src/asahi/compiler/meson.build                |   2 -
 src/asahi/lib/agx_nir_lower_uvs.c             | 251 ++++++++++++++++++
 src/asahi/lib/agx_nir_passes.h                |   3 +
 src/asahi/lib/agx_uvs.h                       |  79 ++++++
 src/asahi/lib/meson.build                     |   1 +
 src/gallium/drivers/asahi/agx_disk_cache.c    |   2 +
 .../drivers/asahi/agx_nir_lower_sysvals.c     |   3 +
 src/gallium/drivers/asahi/agx_state.c         | 227 ++++++----------
 src/gallium/drivers/asahi/agx_state.h         |  17 +-
 13 files changed, 436 insertions(+), 474 deletions(-)
 delete mode 100644 src/asahi/compiler/agx_nir_lower_clip_distance.c
 delete mode 100644 src/asahi/compiler/agx_nir_lower_layer.c
 create mode 100644 src/asahi/lib/agx_nir_lower_uvs.c
 create mode 100644 src/asahi/lib/agx_uvs.h

diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index de45d05815a..730cb1f4b01 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -578,39 +578,6 @@ agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
    agx_emit_cached_split(b, dest, components);
 }
 
-static agx_instr *
-agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
-{
-   nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
-   nir_src *offset = nir_get_io_offset_src(instr);
-   assert(nir_src_is_const(*offset) && "todo: indirects");
-
-   unsigned imm_index = b->shader->out->varyings.vs.slots[sem.location];
-
-   if (sem.location == VARYING_SLOT_LAYER ||
-       sem.location == VARYING_SLOT_CLIP_DIST0) {
-      /* Separate slots used for the sysval vs the varying. The default slot
-       * above is for the varying. Change for the sysval.
-       */
-      assert(sem.no_sysval_output || sem.no_varying);
-
-      if (sem.no_varying) {
-         imm_index = sem.location == VARYING_SLOT_LAYER
-                        ? b->shader->out->varyings.vs.layer_viewport_slot
-                        : b->shader->out->varyings.vs.clip_dist_slot;
-      }
-   }
-
-   assert(imm_index < ~0);
-   imm_index += (nir_src_as_uint(*offset) * 4) + nir_intrinsic_component(instr);
-
-   /* nir_lower_io_to_scalar */
-   assert(nir_intrinsic_write_mask(instr) == 0x1);
-
-   return agx_st_vary(b, agx_immediate(imm_index),
-                      agx_src_index(&instr->src[0]));
-}
-
 static agx_instr *
 agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
 {
@@ -1210,9 +1177,10 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
       agx_emit_load(b, dst, instr);
       return NULL;
 
-   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_uvs_agx:
       assert(stage == MESA_SHADER_VERTEX);
-      return agx_emit_store_vary(b, instr);
+      return agx_st_vary(b, agx_src_index(&instr->src[1]),
+                         agx_src_index(&instr->src[0]));
 
    case nir_intrinsic_store_agx:
       agx_emit_store(b, instr);
@@ -2667,96 +2635,6 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
 }
 
-/* ABI: position first, then user, then psiz */
-static void
-agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings,
-                      struct agx_shader_key *key)
-{
-   unsigned base = 0;
-
-   /* Initialize to "nothing is written" */
-   for (unsigned i = 0; i < ARRAY_SIZE(varyings->slots); ++i)
-      varyings->slots[i] = ~0;
-
-   /* gl_Position is implicitly written, although it may validly be absent in
-    * vertex programs run only for transform feedback. Those ignore their
-    * varyings so it doesn't matter what we do here as long as we don't fail.
-    */
-   varyings->slots[VARYING_SLOT_POS] = base;
-   base += 4;
-
-   /* These are always flat-shaded from the FS perspective */
-   key->vs.outputs_flat_shaded |= VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
-
-   /* The internal cull distance slots are always linearly-interpolated */
-   key->vs.outputs_linear_shaded |=
-      BITFIELD64_RANGE(VARYING_SLOT_CULL_PRIMITIVE, 2);
-
-   assert(!(key->vs.outputs_flat_shaded & key->vs.outputs_linear_shaded));
-
-   /* Smooth 32-bit user bindings go next */
-   u_foreach_bit64(loc, nir->info.outputs_written &
-                           ~key->vs.outputs_flat_shaded &
-                           ~key->vs.outputs_linear_shaded) {
-      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
-         continue;
-
-      assert(loc < ARRAY_SIZE(varyings->slots));
-      varyings->slots[loc] = base;
-      base += 4;
-      varyings->num_32_smooth += 4;
-   }
-
-   /* Flat 32-bit user bindings go next */
-   u_foreach_bit64(loc,
-                   nir->info.outputs_written & key->vs.outputs_flat_shaded) {
-      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
-         continue;
-
-      assert(loc < ARRAY_SIZE(varyings->slots));
-      varyings->slots[loc] = base;
-      base += 4;
-      varyings->num_32_flat += 4;
-   }
-
-   /* Linear 32-bit user bindings go next */
-   u_foreach_bit64(loc,
-                   nir->info.outputs_written & key->vs.outputs_linear_shaded) {
-      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
-         continue;
-
-      assert(loc < ARRAY_SIZE(varyings->slots));
-      varyings->slots[loc] = base;
-      base += 4;
-      varyings->num_32_linear += 4;
-   }
-
-   /* TODO: Link FP16 varyings */
-   varyings->base_index_fp16 = base;
-   varyings->num_16_smooth = 0;
-   varyings->num_16_flat = 0;
-   varyings->num_16_linear = 0;
-
-   if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
-      varyings->slots[VARYING_SLOT_PSIZ] = base;
-      base += 1;
-   }
-
-   if (nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) {
-      varyings->layer_viewport_slot = base;
-      base += 1;
-   }
-
-   if (nir->info.outputs_written & VARYING_BIT_CLIP_DIST0) {
-      varyings->clip_dist_slot = base;
-      varyings->nr_clip_dists = nir->info.clip_distance_array_size;
-      base += varyings->nr_clip_dists;
-   }
-
-   /* All varyings linked now */
-   varyings->nr_index = base;
-}
-
 /*
  * Varyings that are used as texture coordinates should be kept at fp32, because
  * fp16 does not have enough precision for large textures. It's technically
@@ -3188,10 +3066,6 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
    if (nir->info.stage == MESA_SHADER_FRAGMENT)
       out->tag_write_disable = !nir->info.writes_memory;
 
-   if (nir->info.stage == MESA_SHADER_VERTEX &&
-       (nir->info.outputs_written & VARYING_BIT_CLIP_DIST0))
-      NIR_PASS(_, nir, agx_nir_lower_clip_distance);
-
    bool needs_libagx = true /* TODO: Optimize */;
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT)
@@ -3238,19 +3112,6 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
    /* Late VBO lowering creates constant udiv instructions */
    NIR_PASS(_, nir, nir_opt_idiv_const, 16);
 
-   /* Varying output is scalar, other I/O is vector. Lowered late because
-    * transform feedback programs will use vector output.
-    */
-   if (nir->info.stage == MESA_SHADER_VERTEX) {
-      NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
-
-      if (nir->info.outputs_written &
-          (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) {
-
-         NIR_PASS(_, nir, agx_nir_lower_layer);
-      }
-   }
-
    NIR_PASS(_, nir, nir_opt_constant_folding);
    NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle,
             nir_metadata_block_index | nir_metadata_dominance, NULL);
@@ -3258,10 +3119,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
    out->push_count = key->reserved_preamble;
    agx_optimize_nir(nir, &out->push_count);
 
-   /* Must be last since NIR passes can remap driver_location freely */
-   if (nir->info.stage == MESA_SHADER_VERTEX)
-      agx_remap_varyings_vs(nir, &out->varyings.vs, key);
-   else if (nir->info.stage == MESA_SHADER_FRAGMENT)
+   if (nir->info.stage == MESA_SHADER_FRAGMENT)
       assign_coefficient_regs(nir, &out->varyings.fs);
 
    if (agx_should_dump(nir, AGX_DBG_SHADERS))
@@ -3284,9 +3142,6 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
    }
 
    if (nir->info.stage == MESA_SHADER_VERTEX) {
-      out->writes_psiz =
-         nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ);
-
       out->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
 
       out->writes_layer_viewport =
diff --git a/src/asahi/compiler/agx_compile.h b/src/asahi/compiler/agx_compile.h
index 39064cedcb3..811c1cb4407 100644
--- a/src/asahi/compiler/agx_compile.h
+++ b/src/asahi/compiler/agx_compile.h
@@ -9,50 +9,6 @@
 #include "util/u_dynarray.h"
 #include "shader_enums.h"
 
-struct agx_varyings_vs {
-   /* The number of user varyings of each type. The varyings must be allocated
-    * in this order ({smooth, flat, linear} × {32, 16}), which may require
-    * remapping.
-    */
-   unsigned num_32_smooth;
-   unsigned num_32_flat;
-   unsigned num_32_linear;
-   unsigned num_16_smooth;
-   unsigned num_16_flat;
-   unsigned num_16_linear;
-
-   /* The first index used for FP16 varyings. Indices less than this are treated
-    * as FP32. This may require remapping slots to guarantee.
-    */
-   unsigned base_index_fp16;
-
-   /* The total number of vertex shader indices output. Must be at least
-    * base_index_fp16.
-    */
-   unsigned nr_index;
-
-   /* If the slot is written, this is the base index that the first component
-    * of the slot is written to.  The next components are found in the next
-    * indices. If less than base_index_fp16, this is a 32-bit slot (with 4
-    * indices for the 4 components), else this is a 16-bit slot (with 2
-    * indices for the 4 components). This must be less than nr_index.
-    *
-    * If the slot is not written, this must be ~0.
-    */
-   unsigned slots[VARYING_SLOT_MAX];
-
-   /* Slot for the combined layer/viewport 32-bit sysval output, or ~0 if none
-    * is written. What's at slots[VARYING_SLOT_LAYER] is the varying output.
-    */
-   unsigned layer_viewport_slot;
-
-   /* Base slot for the clip distance sysval outputs, or ~0 if none is written.
-    * What's at slots[VARYING_SLOT_CLIP_DIST0] is the varying output.
-    */
-   unsigned clip_dist_slot;
-   unsigned nr_clip_dists;
-};
-
 struct agx_cf_binding {
    /* Base coefficient register */
    unsigned cf_base;
@@ -96,7 +52,6 @@ struct agx_varyings_fs {
 };
 
 union agx_varyings {
-   struct agx_varyings_vs vs;
    struct agx_varyings_fs fs;
 };
 
@@ -127,9 +82,6 @@ struct agx_shader_info {
    /* Does the shader read the tilebuffer? */
    bool reads_tib;
 
-   /* Does the shader write point size? */
-   bool writes_psiz;
-
    /* Does the shader potentially draw to a nonzero viewport? */
    bool nonzero_viewport;
 
@@ -195,17 +147,6 @@ enum agx_format {
    AGX_NUM_FORMATS,
 };
 
-struct agx_vs_shader_key {
-   /* The GPU ABI requires all smooth shaded varyings to come first, then all
-    * flat shaded varyings, then all linear shaded varyings, as written by the
-    * VS. In order to correctly remap the varyings into the right order in the
-    * VS, we need to propagate the mask of flat/linear shaded varyings into the
-    * compiler.
-    */
-   uint64_t outputs_flat_shaded;
-   uint64_t outputs_linear_shaded;
-};
-
 struct agx_fs_shader_key {
    /* Normally, access to the tilebuffer must be guarded by appropriate fencing
     * instructions to ensure correct results in the presence of out-of-order
@@ -246,7 +187,6 @@ struct agx_shader_key {
    bool promote_constants;
 
    union {
-      struct agx_vs_shader_key vs;
       struct agx_fs_shader_key fs;
    };
 };
diff --git a/src/asahi/compiler/agx_nir_lower_clip_distance.c b/src/asahi/compiler/agx_nir_lower_clip_distance.c
deleted file mode 100644
index 589a1b48ed9..00000000000
--- a/src/asahi/compiler/agx_nir_lower_clip_distance.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2023 Valve Corporation
- * SPDX-License-Identifier: MIT
- */
-
-#include "compiler/nir/nir.h"
-#include "compiler/nir/nir_builder.h"
-#include "agx_nir.h"
-
-static bool
-lower(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
-{
-   if (intr->intrinsic != nir_intrinsic_store_output)
-      return false;
-
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-   if (sem.location != VARYING_SLOT_CLIP_DIST0)
-      return false;
-
-   nir_instr *clone = nir_instr_clone(b->shader, &intr->instr);
-   nir_intrinsic_instr *lowered = nir_instr_as_intrinsic(clone);
-
-   b->cursor = nir_after_instr(&intr->instr);
-   nir_builder_instr_insert(b, clone);
-
-   nir_io_semantics new_sem = sem;
-   new_sem.no_varying = true;
-   nir_intrinsic_set_io_semantics(lowered, new_sem);
-
-   sem.no_sysval_output = true;
-   nir_intrinsic_set_io_semantics(intr, sem);
-   return true;
-}
-
-bool
-agx_nir_lower_clip_distance(nir_shader *s)
-{
-   assert(s->info.outputs_written & VARYING_BIT_CLIP_DIST0);
-
-   return nir_shader_intrinsics_pass(
-      s, lower, nir_metadata_block_index | nir_metadata_dominance, NULL);
-}
diff --git a/src/asahi/compiler/agx_nir_lower_layer.c b/src/asahi/compiler/agx_nir_lower_layer.c
deleted file mode 100644
index 646eb73ab1f..00000000000
--- a/src/asahi/compiler/agx_nir_lower_layer.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright 2023 Valve Corporation
- * SPDX-License-Identifier: MIT
- */
-
-#include "compiler/nir/nir.h"
-#include "compiler/nir/nir_builder.h"
-#include "agx_nir.h"
-
-bool
-agx_nir_lower_layer(nir_shader *s)
-{
-   assert(s->info.stage == MESA_SHADER_VERTEX);
-   assert(s->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT));
-
-   /* Writes are in the last block, search */
-   nir_function_impl *impl = nir_shader_get_entrypoint(s);
-   nir_block *last = nir_impl_last_block(impl);
-
-   nir_def *layer = NULL, *viewport = NULL;
-   nir_cursor last_cursor;
-
-   nir_foreach_instr(instr, last) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr);
-      if (store->intrinsic != nir_intrinsic_store_output)
-         continue;
-
-      nir_io_semantics sem = nir_intrinsic_io_semantics(store);
-      nir_def *value = store->src[0].ssa;
-
-      if (sem.location == VARYING_SLOT_LAYER) {
-         assert(layer == NULL && "only written once");
-         layer = value;
-      } else if (sem.location == VARYING_SLOT_VIEWPORT) {
-         assert(viewport == NULL && "only written once");
-         viewport = value;
-      } else {
-         continue;
-      }
-
-      last_cursor = nir_after_instr(&store->instr);
-
-      /* Leave the store as a varying-only, no sysval output */
-      sem.no_sysval_output = true;
-      nir_intrinsic_set_io_semantics(store, sem);
-   }
-
-   assert((layer || viewport) && "metadata inconsistent with program");
-
-   /* Pack together and write out */
-   nir_builder b = nir_builder_at(last_cursor);
-
-   nir_def *zero = nir_imm_intN_t(&b, 0, 16);
-   nir_def *packed =
-      nir_pack_32_2x16_split(&b, layer ? nir_u2u16(&b, layer) : zero,
-                             viewport ? nir_u2u16(&b, viewport) : zero);
-
-   /* Written with a sysval-only store, no varying output */
-   nir_store_output(&b, packed, nir_imm_int(&b, 0),
-                    .io_semantics.location = VARYING_SLOT_LAYER,
-                    .io_semantics.num_slots = 1,
-                    .io_semantics.no_varying = true);
-
-   nir_metadata_preserve(impl,
-                         nir_metadata_dominance | nir_metadata_block_index);
-   return true;
-}
diff --git a/src/asahi/compiler/meson.build b/src/asahi/compiler/meson.build
index 172455db223..18c99041fe8 100644
--- a/src/asahi/compiler/meson.build
+++ b/src/asahi/compiler/meson.build
@@ -8,13 +8,11 @@ libasahi_agx_files = files(
   'agx_liveness.c',
   'agx_insert_waits.c',
   'agx_nir_lower_address.c',
-  'agx_nir_lower_clip_distance.c',
   'agx_nir_lower_cull_distance.c',
   'agx_nir_lower_frag_sidefx.c',
   'agx_nir_lower_sample_mask.c',
   'agx_nir_lower_discard_zs_emit.c',
   'agx_nir_lower_interpolation.c',
-  'agx_nir_lower_layer.c',
   'agx_nir_lower_shared_bitsize.c',
   'agx_nir_lower_subgroups.c',
   'agx_nir_opt_preamble.c',
diff --git a/src/asahi/lib/agx_nir_lower_uvs.c b/src/asahi/lib/agx_nir_lower_uvs.c
new file mode 100644
index 00000000000..a3edb051188
--- /dev/null
+++ b/src/asahi/lib/agx_nir_lower_uvs.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright 2024 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "util/bitscan.h"
+#include "util/macros.h"
+#include "agx_compile.h"
+#include "agx_pack.h"
+#include "agx_uvs.h"
+#include "nir_builder_opcodes.h"
+#include "nir_intrinsics.h"
+#include "nir_intrinsics_indices.h"
+#include "shader_enums.h"
+
+struct ctx { /* Shared state for the store_output -> store_uvs_agx lowering */
+   nir_def *layer, *viewport; /* Values stored to LAYER/VIEWPORT, or NULL */
+   nir_cursor after_layer_viewport; /* Just after the last such store */
+   struct agx_unlinked_uvs_layout *layout; /* Unlinked layout in use */
+};
+
+static enum uvs_group /* UVS group that a store to varying slot loc lands in */
+group_for_varying(gl_varying_slot loc)
+{
+   switch (loc) {
+   case VARYING_SLOT_POS:
+      return UVS_POSITION;
+   case VARYING_SLOT_PSIZ:
+      return UVS_PSIZ;
+   default: /* Everything else, including layer/viewport/clip distance */
+      return UVS_VARYINGS;
+   }
+}
+
+static bool /* Lower one store_output to store_uvs_agx; true if lowered */
+lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   struct ctx *ctx = data;
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   b->cursor = nir_instr_remove(&intr->instr);
+
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+   unsigned component = nir_intrinsic_component(intr);
+
+   nir_def *value = intr->src[0].ssa;
+   nir_def *offset = intr->src[1].ssa; /* Slot offset, possibly non-constant */
+
+   /* If there is only 1 user varying, it is at the base of the varying section.
+    * This saves us an indirection on simple separate shaders.
+    */
+   bool single_vary = util_is_power_of_two_nonzero64(ctx->layout->written);
+   enum uvs_group group = group_for_varying(sem.location);
+
+   nir_def *base; /* 16-bit UVS index of the slot's first component */
+   if ((group == UVS_VARYINGS) && !single_vary)
+      base = nir_load_uvs_index_agx(b, .io_semantics = sem);
+   else
+      base = nir_imm_intN_t(b, ctx->layout->group_offs[group], 16);
+   /* NOTE(review): 4*offset presumes vec4 slots; check const offsets != 0 */
+   nir_def *index = nir_iadd(b, nir_iadd_imm(b, base, component),
+                             nir_imul_imm(b, nir_u2u16(b, offset), 4));
+
+   nir_intrinsic_instr *new_store = nir_store_uvs_agx(b, value, index);
+
+   /* Insert clip distance sysval writes, and gather layer/viewport writes so we
+    * can accumulate their system value. These are still lowered like normal to
+    * write them for the varying FS input.
+    */
+   if (sem.location == VARYING_SLOT_LAYER) {
+      assert(ctx->layer == NULL && "only written once");
+      ctx->layer = value;
+      ctx->after_layer_viewport = nir_after_instr(&new_store->instr);
+   } else if (sem.location == VARYING_SLOT_VIEWPORT) {
+      assert(ctx->viewport == NULL && "only written once");
+      ctx->viewport = value;
+      ctx->after_layer_viewport = nir_after_instr(&new_store->instr);
+   } else if (sem.location == VARYING_SLOT_CLIP_DIST0) {
+      unsigned clip_base = ctx->layout->group_offs[UVS_CLIP_DIST];
+      nir_def *index = nir_iadd_imm(b, nir_imul_imm(b, nir_u2u16(b, offset), 4),
+                                    clip_base + component);
+
+      nir_store_uvs_agx(b, value, index);
+   }
+
+   return true;
+}
+
+static void /* Store the packed layer/viewport sysval word to the UVS */
+write_layer_viewport_sysval(struct ctx *ctx)
+{
+   nir_builder b = nir_builder_at(ctx->after_layer_viewport);
+
+   nir_def *zero = nir_imm_intN_t(&b, 0, 16); /* For an unwritten half */
+   nir_def *layer = ctx->layer ? nir_u2u16(&b, ctx->layer) : zero;
+   nir_def *viewport = ctx->viewport ? nir_u2u16(&b, ctx->viewport) : zero;
+
+   nir_store_uvs_agx(
+      &b, nir_pack_32_2x16_split(&b, layer, viewport),
+      nir_imm_int(&b, ctx->layout->group_offs[UVS_LAYER_VIEWPORT]));
+}
+
+static bool /* Analysis only: record components per slot, never modifies IR */
+gather_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   struct agx_unlinked_uvs_layout *layout = data;
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+   unsigned component = nir_intrinsic_component(intr);
+
+   if (nir_src_is_const(intr->src[1])) {
+      unsigned loc = sem.location + nir_src_as_uint(intr->src[1]);
+      layout->components[loc] = MAX2(layout->components[loc], component + 1);
+   } else { /* Indirect store: reserve a full vec4 for each slot */
+      for (unsigned i = 0; i < sem.num_slots; ++i) {
+         layout->components[sem.location + i] = 4;
+      }
+   }
+
+   return false;
+}
+
+bool /* Lower shader outputs to UVS stores and fill in the unlinked *layout */
+agx_nir_lower_uvs(nir_shader *s, struct agx_unlinked_uvs_layout *layout)
+{
+   bool progress = false;
+
+   /* Scalarize up front so we can ignore vectors later */
+   NIR_PASS(progress, s, nir_lower_io_to_scalar, nir_var_shader_out, NULL,
+            NULL);
+
+   /* Gather the unlinked UVS layout. NOTE(review): assumes zeroed *layout */
+   NIR_PASS(progress, s, nir_shader_intrinsics_pass, gather_components,
+            nir_metadata_block_index | nir_metadata_dominance, layout);
+
+   unsigned sizes[UVS_NUM_GROUP] = { /* In words; UVS_VARYINGS summed below */
+      [UVS_POSITION] = 4,
+      [UVS_PSIZ] = !!(s->info.outputs_written & VARYING_BIT_PSIZ),
+      [UVS_LAYER_VIEWPORT] = !!(s->info.outputs_written &
+                                (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)),
+      [UVS_CLIP_DIST] = s->info.clip_distance_array_size,
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(layout->components); ++i) {
+      if (i != VARYING_SLOT_POS && i != VARYING_SLOT_PSIZ &&
+          layout->components[i]) {
+
+         layout->written |= BITFIELD64_BIT(i);
+         sizes[UVS_VARYINGS] += layout->components[i];
+      }
+   }
+
+   unsigned offs = 0;
+   for (enum uvs_group g = 0; g < UVS_NUM_GROUP; ++g) { /* Enum order */
+      layout->group_offs[g] = offs;
+      offs += sizes[g];
+   }
+
+   layout->size = offs;
+   layout->user_size = sizes[UVS_VARYINGS];
+
+   /* Now lower in terms of the unlinked layout */
+   struct ctx ctx = {.layout = layout};
+   NIR_PASS(progress, s, nir_shader_intrinsics_pass, lower,
+            nir_metadata_block_index | nir_metadata_dominance, &ctx);
+
+   if (ctx.layer || ctx.viewport) {
+      write_layer_viewport_sysval(&ctx);
+   }
+
+   /* Finally, pack what we can. It's much cheaper to do this at compile-time
+    * than draw-time.
+    */
+   agx_pack(&layout->osel, OUTPUT_SELECT, cfg) {
+      cfg.point_size = sizes[UVS_PSIZ];
+      cfg.viewport_target = sizes[UVS_LAYER_VIEWPORT];
+      cfg.render_target = cfg.viewport_target;
+
+      cfg.clip_distance_plane_0 = sizes[UVS_CLIP_DIST] > 0;
+      cfg.clip_distance_plane_1 = sizes[UVS_CLIP_DIST] > 1;
+      cfg.clip_distance_plane_2 = sizes[UVS_CLIP_DIST] > 2;
+      cfg.clip_distance_plane_3 = sizes[UVS_CLIP_DIST] > 3;
+      cfg.clip_distance_plane_4 = sizes[UVS_CLIP_DIST] > 4;
+      cfg.clip_distance_plane_5 = sizes[UVS_CLIP_DIST] > 5;
+      cfg.clip_distance_plane_6 = sizes[UVS_CLIP_DIST] > 6;
+      cfg.clip_distance_plane_7 = sizes[UVS_CLIP_DIST] > 7;
+   }
+
+   agx_pack(&layout->vdm, VDM_STATE_VERTEX_OUTPUTS, cfg) {
+      cfg.output_count_1 = offs;
+      cfg.output_count_2 = offs;
+   }
+
+   return progress;
+}
+
+void /* Orders written user varyings smooth, flat, linear; packs the counts */
+agx_assign_uvs(struct agx_varyings_vs *varyings,
+               struct agx_unlinked_uvs_layout *layout, uint64_t flat_mask,
+               uint64_t linear_mask)
+{
+   *varyings = (struct agx_varyings_vs){0};
+
+   /* These are always flat-shaded from the FS perspective */
+   flat_mask |= VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
+
+   /* The internal cull distance slots are always linearly-interpolated */
+   linear_mask |= BITFIELD64_RANGE(VARYING_SLOT_CULL_PRIMITIVE, 2);
+
+   assert(!(flat_mask & linear_mask));
+
+   /* TODO: Link FP16 varyings */
+   unsigned num_32_smooth = 0, num_32_flat = 0, num_32_linear = 0;
+   struct {
+      uint32_t *num;
+      uint64_t mask;
+   } parts[] = {
+      {&num_32_smooth, ~flat_mask & ~linear_mask},
+      {&num_32_flat, flat_mask},
+      {&num_32_linear, linear_mask},
+   };
+
+   unsigned base = layout->group_offs[UVS_VARYINGS]; /* Next free index */
+
+   for (unsigned p = 0; p < ARRAY_SIZE(parts); ++p) {
+      u_foreach_bit64(loc, parts[p].mask & layout->written) {
+         assert(loc < ARRAY_SIZE(varyings->slots));
+         varyings->slots[loc] = base;
+
+         base += layout->components[loc];
+         (*parts[p].num) += layout->components[loc];
+      }
+   }
+
+   agx_pack(&varyings->counts_32, VARYING_COUNTS, cfg) {
+      cfg.smooth = num_32_smooth;
+      cfg.flat = num_32_flat;
+      cfg.linear = num_32_linear;
+   }
+
+   agx_pack(&varyings->counts_16, VARYING_COUNTS, cfg) {
+      cfg.smooth = 0;
+      cfg.flat = 0;
+      cfg.linear = 0;
+   }
+}
diff --git a/src/asahi/lib/agx_nir_passes.h b/src/asahi/lib/agx_nir_passes.h
index daa2a4929e8..1d2d4b9edaf 100644
--- a/src/asahi/lib/agx_nir_passes.h
+++ b/src/asahi/lib/agx_nir_passes.h
@@ -6,6 +6,9 @@
 #pragma once
 
 #include <stdbool.h>
+#include <stdint.h>
+#include "agx_pack.h"
+#include "shader_enums.h"
 
 struct nir_shader;
 struct nir_instr;
diff --git a/src/asahi/lib/agx_uvs.h b/src/asahi/lib/agx_uvs.h
new file mode 100644
index 00000000000..424db3661bf
--- /dev/null
+++ b/src/asahi/lib/agx_uvs.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2024 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include "agx_pack.h"
+#include "shader_enums.h"
+
+struct nir_shader;
+
+/* Matches the hardware order */
+enum uvs_group {
+   UVS_POSITION,       /* 4 words, always allocated */
+   UVS_VARYINGS,       /* User varyings, sized by the components written */
+   UVS_PSIZ,           /* 1 word if point size is written, else empty */
+   UVS_LAYER_VIEWPORT, /* 1 word if layer or viewport is written, else empty */
+   UVS_CLIP_DIST,      /* 1 word per clip distance */
+   UVS_NUM_GROUP,
+};
+
+/**
+ * Represents an "unlinked" UVS layout. This is computable from an unlinked
+ * vertex shader without knowing the associated fragment shader. The various UVS
+ * groups have fixed offsets, but the varyings within the varying group have
+ * indeterminate order since we don't yet know the fragment shader interpolation
+ * qualifiers.
+ */
+struct agx_unlinked_uvs_layout {
+   /* Offset of each group in the UVS in words. */
+   uint8_t group_offs[UVS_NUM_GROUP];
+
+   /* Size of the UVS allocation in words. >= last group_offs element */
+   uint8_t size;
+
+   /* Size of the UVS_VARYINGS group in words */
+   uint8_t user_size;
+
+   /* Number of 32-bit components allocated for each slot: the highest written
+    * component + 1, or 4 if stored indirectly. TODO: Model 16-bit.
+    * Invariant: sum_{slot in written} (components[slot]) = user_size =
+    *            group_offs[PSIZ] - group_offs[VARYINGS]
+    */
+   uint8_t components[VARYING_SLOT_MAX];
+
+   /* Bit i set <===> components[i] != 0 && i != POS && i != PSIZ. For fast
+    * iteration of user varyings.
+    */
+   uint64_t written;
+
+   /* Fully packed data structure */
+   struct agx_vdm_state_vertex_outputs_packed vdm;
+
+   /* Partial data structure, must be merged with FS selects */
+   struct agx_output_select_packed osel;
+};
+
+bool agx_nir_lower_uvs(struct nir_shader *s,
+                       struct agx_unlinked_uvs_layout *layout);
+
+/**
+ * Represents a linked UVS layout, as computed by agx_assign_uvs().
+ */
+struct agx_varyings_vs {
+   /* Packed VARYING_COUNTS; counts_16 is all-zero until FP16 is linked */
+   struct agx_varying_counts_packed counts_32, counts_16;
+
+   /* If the user varying slot is written, this is the base index that the first
+    * component of the slot is written to. The next components are found in the
+    * next indices. Otherwise 0, aliasing position.
+    */
+   unsigned slots[VARYING_SLOT_MAX];
+};
+
+void agx_assign_uvs(struct agx_varyings_vs *varyings,
+                    struct agx_unlinked_uvs_layout *layout, uint64_t flat_mask,
+                    uint64_t linear_mask);
diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build
index 3bf9590e415..047608b1c48 100644
--- a/src/asahi/lib/meson.build
+++ b/src/asahi/lib/meson.build
@@ -20,6 +20,7 @@ libasahi_lib_files = files(
   'agx_nir_lower_tess.c',
   'agx_nir_lower_texture.c',
   'agx_nir_lower_tilebuffer.c',
+  'agx_nir_lower_uvs.c',
   'agx_nir_lower_vbo.c',
   'agx_nir_predicate_layer_id.c',
   'agx_ppp.h',
diff --git a/src/gallium/drivers/asahi/agx_disk_cache.c b/src/gallium/drivers/asahi/agx_disk_cache.c
index b396b5d957e..235ba1cfd74 100644
--- a/src/gallium/drivers/asahi/agx_disk_cache.c
+++ b/src/gallium/drivers/asahi/agx_disk_cache.c
@@ -63,6 +63,7 @@ write_shader(struct blob *blob, const struct agx_compiled_shader *binary,
    blob_write_uint32(blob, shader_size);
    blob_write_bytes(blob, binary->bo->ptr.cpu, shader_size);
    blob_write_bytes(blob, &binary->info, sizeof(binary->info));
+   blob_write_bytes(blob, &binary->uvs, sizeof(binary->uvs));
    blob_write_uint32(blob, binary->push_range_count);
    blob_write_bytes(blob, binary->push,
                     sizeof(binary->push[0]) * binary->push_range_count);
@@ -96,6 +97,7 @@ read_shader(struct agx_screen *screen, struct blob_reader *blob,
    blob_copy_bytes(blob, binary->bo->ptr.cpu, binary_size);
 
    blob_copy_bytes(blob, &binary->info, sizeof(binary->info));
+   blob_copy_bytes(blob, &binary->uvs, sizeof(binary->uvs));
    binary->push_range_count = blob_read_uint32(blob);
    blob_copy_bytes(blob, binary->push,
                    sizeof(binary->push[0]) * binary->push_range_count);
diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
index 7f3147a4fb3..9c2bd4ef955 100644
--- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
+++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
@@ -189,6 +189,9 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_root(b, 1, 16, &u->sprite_mask);
    case nir_intrinsic_load_clip_z_coeff_agx:
       return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff));
+   case nir_intrinsic_load_uvs_index_agx:
+      return load_sysval_root(
+         b, 1, 16, &u->uvs_index[nir_intrinsic_io_semantics(intr).location]);
    case nir_intrinsic_load_polygon_stipple_agx: {
       nir_def *base = load_sysval_root(b, 1, 64, &u->polygon_stipple);
       nir_def *row = intr->src[0].ssa;
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 693ddfcbca9..4dd4ce59a47 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -1521,59 +1521,10 @@ asahi_cs_shader_key_equal(const void *a, const void *b)
    return true;
 }
 
-static unsigned
-agx_find_linked_slot(struct agx_varyings_vs *vs, struct agx_varyings_fs *fs,
-                     gl_varying_slot slot, unsigned offset)
-{
-   assert(offset < 4);
-   assert(slot != VARYING_SLOT_PNTC && "point coords aren't linked");
-
-   if (slot == VARYING_SLOT_POS) {
-      if (offset == 3) {
-         return 0; /* W */
-      } else if (offset == 2) {
-         assert(fs->reads_z);
-         return 1; /* Z */
-      } else {
-         unreachable("gl_Position.xy are not varyings");
-      }
-   }
-
-   unsigned vs_index = vs->slots[slot];
-
-   /* Varyings not written by vertex shader are undefined but we can't crash */
-   if (!(vs_index < vs->nr_index))
-      return 0;
-
-   assert(vs_index >= 4 && "gl_Position should have been the first 4 slots");
-   assert((vs_index < vs->base_index_fp16) ==
-             ((vs_index + offset) < vs->base_index_fp16) &&
-          "a given varying must have a consistent type");
-
-   unsigned vs_user_index = (vs_index + offset) - 4;
-
-   if (fs->reads_z)
-      return vs_user_index + 2;
-   else
-      return vs_user_index + 1;
-}
-
-static unsigned
-agx_num_general_outputs(struct agx_varyings_vs *vs)
-{
-   unsigned nr_vs = vs->nr_index;
-   bool writes_psiz = vs->slots[VARYING_SLOT_PSIZ] < nr_vs;
-
-   assert(nr_vs >= 4 && "gl_Position must be written");
-   if (writes_psiz)
-      assert(nr_vs >= 5 && "gl_PointSize is written");
-
-   return nr_vs - (writes_psiz ? 5 : 4);
-}
-
 static uint32_t
 agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs,
-                        struct agx_varyings_fs *fs, bool first_provoking_vertex,
+                        unsigned nr_user_indices, struct agx_varyings_fs *fs,
+                        bool first_provoking_vertex,
                         uint8_t sprite_coord_enable,
                         bool *generate_primitive_id)
 {
@@ -1586,11 +1537,14 @@ agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs,
    size_t linkage_size =
       AGX_CF_BINDING_HEADER_LENGTH + (fs->nr_bindings * AGX_CF_BINDING_LENGTH);
 
-   void *tmp = alloca(linkage_size);
-   struct agx_cf_binding_header_packed *header = tmp;
+   struct agx_ptr t = agx_pool_alloc_aligned(pool, linkage_size, 256);
+   assert(t.gpu < (1ull << 32) && "varyings must be in low memory");
+
+   struct agx_cf_binding_header_packed *header = t.cpu;
    struct agx_cf_binding_packed *bindings = (void *)(header + 1);
 
-   unsigned nr_slots = agx_num_general_outputs(vs) + 1 + (fs->reads_z ? 1 : 0);
+   unsigned user_base = 1 + (fs->reads_z ? 1 : 0);
+   unsigned nr_slots = user_base + nr_user_indices;
 
    agx_pack(header, CF_BINDING_HEADER, cfg) {
       cfg.number_of_32_bit_slots = nr_slots;
@@ -1598,35 +1552,45 @@ agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs,
    }
 
    for (unsigned i = 0; i < fs->nr_bindings; ++i) {
+      struct agx_cf_binding b = fs->bindings[i];
+
       agx_pack(bindings + i, CF_BINDING, cfg) {
-         cfg.base_coefficient_register = fs->bindings[i].cf_base;
-         cfg.components = fs->bindings[i].count;
+         cfg.base_coefficient_register = b.cf_base;
+         cfg.components = b.count;
          cfg.shade_model =
             agx_translate_shade_model(fs, i, first_provoking_vertex);
 
-         if (util_varying_is_point_coord(fs->bindings[i].slot,
-                                         sprite_coord_enable)) {
-            assert(fs->bindings[i].offset == 0);
+         if (util_varying_is_point_coord(b.slot, sprite_coord_enable)) {
+            assert(b.offset == 0);
             cfg.source = AGX_COEFFICIENT_SOURCE_POINT_COORD;
-         } else if (fs->bindings[i].slot == VARYING_SLOT_PRIMITIVE_ID &&
-                    vs->slots[VARYING_SLOT_PRIMITIVE_ID] == ~0) {
+         } else if (b.slot == VARYING_SLOT_PRIMITIVE_ID &&
+                    !vs->slots[VARYING_SLOT_PRIMITIVE_ID]) {
             cfg.source = AGX_COEFFICIENT_SOURCE_PRIMITIVE_ID;
             *generate_primitive_id = true;
-         } else {
-            cfg.base_slot = agx_find_linked_slot(vs, fs, fs->bindings[i].slot,
-                                                 fs->bindings[i].offset);
+         } else if (b.slot == VARYING_SLOT_POS) {
+            assert(b.offset >= 2 && "gl_Position.xy are not varyings");
+            assert(fs->reads_z || b.offset != 2);
 
-            assert(cfg.base_slot + cfg.components <=
-                      MAX2(nr_slots, cfg.components) &&
-                   "overflow slots");
-         }
-
-         if (fs->bindings[i].slot == VARYING_SLOT_POS) {
-            if (fs->bindings[i].offset == 2) {
+            if (b.offset == 2) {
                cfg.source = AGX_COEFFICIENT_SOURCE_FRAGCOORD_Z;
+               cfg.base_slot = 1;
             } else {
-               assert(!fs->bindings[i].perspective &&
-                      "W must not be perspective divided");
+               assert(!b.perspective && "W must not be perspective divided");
+            }
+         } else {
+            unsigned vs_index = vs->slots[b.slot];
+            assert(b.offset < 4);
+
+            /* Varyings not written by the vertex shader are undefined, but we
+             * must not crash: base_slot is simply left unassigned for them. */
+            if (vs_index) {
+               assert(vs_index >= 4 &&
+                      "gl_Position should have been the first 4 slots");
+
+               cfg.base_slot = user_base + (vs_index - 4) + b.offset;
+
+               assert(cfg.base_slot + cfg.components <= nr_slots &&
+                      "overflow slots");
             }
          }
 
@@ -1635,16 +1599,7 @@ agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs,
       }
    }
 
-   struct agx_ptr ptr = agx_pool_alloc_aligned(pool, (3 * linkage_size), 256);
-   assert(ptr.gpu < (1ull << 32) && "varyings must be in low memory");
-
-   /* I don't understand why the data structures are repeated thrice */
-   for (unsigned i = 0; i < 3; ++i) {
-      memcpy(((uint8_t *)ptr.cpu) + (i * linkage_size), (uint8_t *)tmp,
-             linkage_size);
-   }
-
-   return ptr.gpu;
+   return t.gpu;
 }
 
 /* Dynamic lowered I/O version of nir_lower_clip_halfz */
@@ -1859,6 +1814,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
    perf_debug(dev, "Compiling shader variant #%u",
               _mesa_hash_table_num_entries(so->variants));
 
+   struct agx_unlinked_uvs_layout uvs = {0};
    bool force_translucent = false;
 
    if (nir->info.stage == MESA_SHADER_VERTEX) {
@@ -1871,6 +1827,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
                   key->next.hw.fixed_point_size);
          NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
                   nir_metadata_block_index | nir_metadata_dominance, NULL);
+         NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs);
       } else {
          NIR_PASS(_, nir, agx_nir_lower_sysvals, PIPE_SHADER_VERTEX, false);
          NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx,
@@ -1993,21 +1950,11 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
 
    struct agx_shader_key base_key = {0};
 
-   if (nir->info.stage == MESA_SHADER_VERTEX) {
-      struct asahi_vs_shader_key *key = &key_->vs;
-
-      if (key->hw) {
-         base_key.vs.outputs_flat_shaded = key_->vs.next.hw.outputs_flat_shaded;
-
-         base_key.vs.outputs_linear_shaded =
-            key_->vs.next.hw.outputs_linear_shaded;
-      }
-   }
-
    struct agx_compiled_shader *compiled =
       agx_compile_nir(dev, nir, &base_key, debug, so->type);
 
    compiled->so = so;
+   compiled->uvs = uvs;
 
    /* reads_tib => Translucent pass type */
    compiled->info.reads_tib |= force_translucent;
@@ -2039,13 +1986,14 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
       NIR_PASS(_, gs_copy, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
                nir_metadata_block_index | nir_metadata_dominance, NULL);
 
-      base_key.vs.outputs_flat_shaded = key->outputs_flat_shaded;
-      base_key.vs.outputs_linear_shaded = key->outputs_linear_shaded;
+      struct agx_unlinked_uvs_layout uvs = {0};
+      NIR_PASS(_, gs_copy, agx_nir_lower_uvs, &uvs);
 
       compiled->gs_copy =
          agx_compile_nir(dev, gs_copy, &base_key, debug, PIPE_SHADER_GEOMETRY);
       compiled->gs_copy->so = so;
       compiled->gs_copy->stage = so->type;
+      compiled->gs_copy->uvs = uvs;
    }
 
    compiled->gs_output_mode = gs_out_prim;
@@ -2427,10 +2375,9 @@ agx_update_vs(struct agx_context *ctx, unsigned index_size_B)
     *
     * vb_mask, attributes, vertex_buffers: VERTEX
     * point_size_per_vertex: RS
-    * outputs_{flat,linear}_shaded: FS_PROG
     */
    if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB |
-                        AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM)) ||
+                        AGX_DIRTY_RS | AGX_DIRTY_PRIM)) ||
          ctx->stage[PIPE_SHADER_TESS_EVAL].dirty ||
          ctx->stage[PIPE_SHADER_GEOMETRY].dirty ||
          ctx->stage[PIPE_SHADER_TESS_EVAL].shader ||
@@ -2451,11 +2398,6 @@ agx_update_vs(struct agx_context *ctx, unsigned index_size_B)
        */
       key.next.hw.fixed_point_size = !ctx->rast->base.point_size_per_vertex &&
                                      rasterized_prim == MESA_PRIM_POINTS;
-
-      key.next.hw.outputs_flat_shaded =
-         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded;
-      key.next.hw.outputs_linear_shaded =
-         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded;
    } else {
       key.next.sw.index_size_B = index_size_B;
    }
@@ -2511,10 +2453,6 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
       /* TODO: Deduplicate */
       .fixed_point_size = !ctx->rast->base.point_size_per_vertex &&
                           rasterized_prim == MESA_PRIM_POINTS,
-      .outputs_flat_shaded =
-         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
-      .outputs_linear_shaded =
-         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded,
    };
 
    return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
@@ -3564,8 +3502,9 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out)
 
    if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS) ||
        IS_DIRTY(PRIM)) {
+
       batch->varyings = agx_link_varyings_vs_fs(
-         &batch->pipeline_pool, &vs->info.varyings.vs,
+         &batch->pipeline_pool, &batch->linked_varyings, vs->uvs.user_size,
          &ctx->fs->info.varyings.fs, ctx->rast->base.flatshade_first,
          (batch->reduced_prim == MESA_PRIM_POINTS)
             ? ctx->rast->base.sprite_coord_enable
@@ -3596,10 +3535,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out)
          cfg.pipeline = agx_build_pipeline(batch, vs, PIPE_SHADER_VERTEX, 0, 0);
       }
 
-      agx_push(out, VDM_STATE_VERTEX_OUTPUTS, cfg) {
-         cfg.output_count_1 = vs->info.varyings.vs.nr_index;
-         cfg.output_count_2 = cfg.output_count_1;
-      }
+      agx_push_packed(out, vs->uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
 
       agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
          cfg.flat_shading_control = ctx->rast->base.flatshade_first
@@ -3654,9 +3590,9 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out)
       .fragment_back_face = fragment_face_dirty,
       .fragment_back_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
       .fragment_back_stencil = IS_DIRTY(ZS),
-      .output_select = IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG),
-      .varying_counts_32 = IS_DIRTY(VS_PROG),
-      .varying_counts_16 = IS_DIRTY(VS_PROG),
+      .output_select = varyings_dirty,
+      .varying_counts_32 = varyings_dirty,
+      .varying_counts_16 = varyings_dirty,
       .cull = IS_DIRTY(RS),
       .cull_2 = varyings_dirty,
       .fragment_shader =
@@ -3742,40 +3678,24 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out)
    if (dirty.fragment_back_stencil)
       agx_ppp_push_packed(&ppp, ctx->zs->back_stencil.opaque, FRAGMENT_STENCIL);
 
-   if (dirty.output_select) {
-      agx_ppp_push(&ppp, OUTPUT_SELECT, cfg) {
-         cfg.varyings = !!fs->info.varyings.fs.nr_bindings;
-         cfg.point_size = vs->info.writes_psiz;
-         cfg.viewport_target = vs->info.writes_layer_viewport;
-         cfg.render_target = vs->info.writes_layer_viewport;
-         cfg.frag_coord_z = fs->info.varyings.fs.reads_z;
-         cfg.clip_distance_plane_0 = vs->info.varyings.vs.nr_clip_dists > 0;
-         cfg.clip_distance_plane_1 = vs->info.varyings.vs.nr_clip_dists > 1;
-         cfg.clip_distance_plane_2 = vs->info.varyings.vs.nr_clip_dists > 2;
-         cfg.clip_distance_plane_3 = vs->info.varyings.vs.nr_clip_dists > 3;
-         cfg.clip_distance_plane_4 = vs->info.varyings.vs.nr_clip_dists > 4;
-         cfg.clip_distance_plane_5 = vs->info.varyings.vs.nr_clip_dists > 5;
-         cfg.clip_distance_plane_6 = vs->info.varyings.vs.nr_clip_dists > 6;
-         cfg.clip_distance_plane_7 = vs->info.varyings.vs.nr_clip_dists > 7;
-
-         assert(cfg.point_size || !is_points);
-      }
-   }
-
    assert(dirty.varying_counts_32 == dirty.varying_counts_16);
+   assert(dirty.varying_counts_32 == dirty.output_select);
 
-   if (dirty.varying_counts_32) {
-      agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
-         cfg.smooth = vs->info.varyings.vs.num_32_smooth;
-         cfg.flat = vs->info.varyings.vs.num_32_flat;
-         cfg.linear = vs->info.varyings.vs.num_32_linear;
+   if (dirty.output_select) {
+      struct agx_output_select_packed osel;
+      agx_pack(&osel, OUTPUT_SELECT, cfg) {
+         cfg.varyings = !!fs->info.varyings.fs.nr_bindings;
+         cfg.frag_coord_z = fs->info.varyings.fs.reads_z;
       }
 
-      agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
-         cfg.smooth = vs->info.varyings.vs.num_16_smooth;
-         cfg.flat = vs->info.varyings.vs.num_16_flat;
-         cfg.linear = vs->info.varyings.vs.num_16_linear;
-      }
+      agx_merge(osel, vs->uvs.osel, OUTPUT_SELECT);
+      agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT);
+
+      agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_32,
+                          VARYING_COUNTS);
+
+      agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_16,
+                          VARYING_COUNTS);
    }
 
    if (dirty.cull)
@@ -3817,7 +3737,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out)
 
    if (dirty.output_size) {
       agx_ppp_push(&ppp, OUTPUT_SIZE, cfg)
-         cfg.count = vs->info.varyings.vs.nr_index;
+         cfg.count = vs->uvs.size;
    }
 
    agx_ppp_fini(&out, &ppp);
@@ -5061,6 +4981,21 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       agx_batch_add_bo(batch, ctx->gs->gs_copy->bo);
    }
 
+   if (ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG)) {
+      struct agx_compiled_shader *vs = ctx->vs;
+      if (ctx->gs)
+         vs = ctx->gs->gs_copy;
+
+      agx_assign_uvs(
+         &batch->linked_varyings, &vs->uvs,
+         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
+         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded);
+
+      for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
+         batch->uniforms.uvs_index[i] = batch->linked_varyings.slots[i];
+      }
+   }
+
    /* Set draw ID */
    if (ctx->vs->info.uses_draw_id) {
       batch->uniforms.draw_id = drawid_offset;
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 8c8a56f931f..c5335c769cf 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -14,6 +14,7 @@
 #include "asahi/lib/agx_nir_lower_vbo.h"
 #include "asahi/lib/agx_scratch.h"
 #include "asahi/lib/agx_tilebuffer.h"
+#include "asahi/lib/agx_uvs.h"
 #include "asahi/lib/pool.h"
 #include "asahi/lib/shaders/geometry.h"
 #include "compiler/nir/nir_lower_blend.h"
@@ -29,6 +30,7 @@
 #include "util/u_range.h"
 #include "agx_helpers.h"
 #include "agx_meta.h"
+#include "agx_nir_passes.h"
 
 #ifdef __GLIBC__
 #include <errno.h>
@@ -162,6 +164,11 @@ struct PACKED agx_draw_uniforms {
 
    /* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */
    uint16_t clip_z_coeff;
+
+   /* Mapping from varying slots written by the last vertex stage to UVS
+    * indices. This mapping must be compatible with the fragment shader.
+    */
+   uint16_t uvs_index[VARYING_SLOT_MAX];
 };
 
 struct PACKED agx_stage_uniforms {
@@ -221,6 +228,9 @@ struct agx_compiled_shader {
    unsigned push_range_count;
    struct agx_push_range push[AGX_MAX_PUSH_RANGES];
 
+   /* UVS layout for the last vertex stage */
+   struct agx_unlinked_uvs_layout uvs;
+
    /* Auxiliary programs, or NULL if not used */
    struct agx_compiled_shader *gs_count, *pre_gs;
    struct agx_compiled_shader *gs_copy;
@@ -366,6 +376,7 @@ struct agx_batch {
 
    /* Current varyings linkage structures */
    uint32_t varyings;
+   struct agx_varyings_vs linked_varyings;
 
    struct agx_draw_uniforms uniforms;
    struct agx_stage_uniforms stage_uniforms[PIPE_SHADER_TYPES];
@@ -478,8 +489,6 @@ struct asahi_vs_shader_key {
 
       struct {
          bool fixed_point_size;
-         uint64_t outputs_flat_shaded;
-         uint64_t outputs_linear_shaded;
       } hw;
    } next;
 };
@@ -512,15 +521,13 @@ struct asahi_fs_shader_key {
 
 struct asahi_gs_shader_key {
    /* Rasterizer shader key */
-   uint64_t outputs_flat_shaded;
-   uint64_t outputs_linear_shaded;
    bool fixed_point_size;
 
    /* If true, this GS is run only for its side effects (including XFB) */
    bool rasterizer_discard;
    bool padding[6];
 };
-static_assert(sizeof(struct asahi_gs_shader_key) == 24, "no holes");
+static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes");
 
 union asahi_shader_key {
    struct asahi_vs_shader_key vs;