asahi: switch to VS/FS prolog/epilog system

With the exception of some variants for framebuffer fetch (to be addressed in a
follow-up MR, as this one is big enough as it is), this switches us to a shader
precompile path for VS & FS. VS prologs let us implement vertex buffer fetch
with dynamic formats, FS prologs let us implement misc emulation like API sample
masking and cull distance, and FS epilogs handle blending and tilebuffer
stores. This should cut down shader recompile jank significantly in the GL
driver. It also prepares us with most of what we need for big-ticket Vulkan
extensions like ESO, GPL, and EDS3.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
Alyssa Rosenzweig 2024-03-20 10:34:48 -04:00 committed by Marge Bot
parent 742a842811
commit fcf1a8062b
21 changed files with 1598 additions and 643 deletions
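A rough sketch of the flow this enables (illustrative only, not code from this MR; `libagx`, `dev`, `memctx`, and `api_vs` are placeholder variables). The API shader is compiled once as a part, the prolog is generated from its key and compiled as a secondary part, and the two are stitched together with the fast linker:

struct agx_shader_part main_part, prolog_part;

/* Compile the API vertex shader once, independent of vertex buffer state */
struct agx_shader_key main_key = {.libagx = libagx};
agx_preprocess_nir(api_vs, libagx);
agx_compile_shader_nir(api_vs, &main_key, NULL, &main_part);

/* Generate and compile a prolog for the current vertex buffer layout
 * (in practice this is cached on the prolog key)
 */
struct agx_vs_prolog_key prolog_key = {.hw = true /* plus attribs, component_mask */};
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX,
                                               &agx_nir_options, "VS prolog");
agx_nir_vs_prolog(&b, &prolog_key);

struct agx_shader_key part_key = {
   .libagx = libagx,
   .secondary = true, /* no preamble/uniform allocation in shader parts */
   .no_stop = true,   /* control falls through to the main part */
};
agx_preprocess_nir(b.shader, libagx);
agx_compile_shader_nir(b.shader, &part_key, NULL, &prolog_part);

/* Stitch the parts into one executable at draw time */
struct agx_linked_shader *linked = agx_fast_link(
   memctx, dev, false /* fragment */, &main_part, &prolog_part, NULL, 1);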


@ -511,9 +511,6 @@ main(int argc, char **argv)
nir_shader *nir = compile(mem_ctx, final_spirv.data, final_spirv.size);
{
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
nir_builder b = nir_builder_init_simple_shader(
MESA_SHADER_COMPUTE, &agx_nir_options, "Helper shader");
@ -522,17 +519,18 @@ main(int argc, char **argv)
nir_call(&b, nir_function_clone(b.shader, func));
UNUSED struct agx_shader_info compiled_info;
struct agx_shader_part compiled;
struct agx_shader_key key = {
.libagx = nir,
.is_helper = true,
};
agx_preprocess_nir(b.shader, nir);
agx_compile_shader_nir(b.shader, &key, NULL, &binary, &compiled_info);
agx_compile_shader_nir(b.shader, &key, NULL, &compiled);
print_u32_data(fp, "libagx_g13", "helper", binary.data, binary.size);
util_dynarray_fini(&binary);
print_u32_data(fp, "libagx_g13", "helper", compiled.binary,
compiled.binary_size);
free(compiled.binary);
ralloc_free(b.shader);
/* Remove the NIR function, it's compiled, we don't need it at runtime */


@ -647,7 +647,6 @@ agx_emit_local_load_pixel(agx_builder *b, agx_index dest,
assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
agx_wait_pix(b, 0x0008);
b->shader->did_writeout = true;
b->shader->out->reads_tib = true;
unsigned nr_comps = instr->def.num_components;
agx_ld_tile_to(b, dest, agx_src_index(&instr->src[0]),
@ -2620,7 +2619,7 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
} while (progress);
}
if (likely(!(agx_compiler_debug & AGX_DBG_NOPREAMBLE)))
if (preamble_size && (!(agx_compiler_debug & AGX_DBG_NOPREAMBLE)))
NIR_PASS(_, nir, agx_nir_opt_preamble, preamble_size);
/* Forming preambles may dramatically reduce the instruction count
@ -2845,6 +2844,10 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
ctx->indexed_nir_blocks = rzalloc_array(ctx, agx_block *, impl->num_blocks);
list_inithead(&ctx->blocks);
if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble) {
ctx->any_cf = key->fs.inside_sample_loop;
}
ctx->alloc = impl->ssa_alloc;
emit_cf_list(ctx, &impl->body);
agx_emit_phis_deferred(ctx);
@ -2860,9 +2863,11 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
/* Stop the main shader or preamble shader after the exit block. For real
* functions, we would return here.
*/
agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
agx_stop(&_b);
if (!ctx->key->no_stop || ctx->is_preamble) {
agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
agx_stop(&_b);
}
/* Index blocks now that we're done emitting so the order is consistent */
agx_foreach_block(ctx, block)
@ -2884,7 +2889,8 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
agx_opt_compact_constants(ctx);
/* After inlining constants, promote what's left */
if (key->promote_constants && !(agx_compiler_debug & AGX_DBG_NOPROMOTE)) {
if (key->promote_constants && !key->secondary &&
!(agx_compiler_debug & AGX_DBG_NOPROMOTE)) {
agx_opt_promote_constants(ctx);
}
}
@ -2930,7 +2936,8 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
out->scratch_size = stack_size;
}
if (ctx->stage == MESA_SHADER_VERTEX && !impl->function->is_preamble)
if (ctx->stage == MESA_SHADER_VERTEX && !impl->function->is_preamble &&
!ctx->key->secondary)
agx_set_st_vary_final(ctx);
agx_insert_waits(ctx);
@ -3083,10 +3090,13 @@ agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
void
agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
struct util_debug_callback *debug,
struct util_dynarray *binary,
struct agx_shader_info *out)
struct agx_shader_part *out)
{
agx_compiler_debug = agx_get_compiler_debug();
struct agx_shader_info *info = &out->info;
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
memset(out, 0, sizeof *out);
@ -3096,7 +3106,7 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
/* If required, tag writes will be enabled by instruction selection */
if (nir->info.stage == MESA_SHADER_FRAGMENT)
out->tag_write_disable = !nir->info.writes_memory;
info->tag_write_disable = !nir->info.writes_memory;
bool needs_libagx = true /* TODO: Optimize */;
@ -3148,50 +3158,53 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle,
nir_metadata_block_index | nir_metadata_dominance, NULL);
out->push_count = key->reserved_preamble;
agx_optimize_nir(nir, &out->push_count);
info->push_count = key->reserved_preamble;
agx_optimize_nir(nir, key->secondary ? NULL : &info->push_count);
if (nir->info.stage == MESA_SHADER_FRAGMENT)
assign_coefficient_regs(nir, &out->varyings.fs);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
info->varyings.fs.nr_cf = key->fs.cf_base;
assign_coefficient_regs(nir, &info->varyings.fs);
}
if (agx_should_dump(nir, AGX_DBG_SHADERS))
nir_print_shader(nir, stdout);
out->local_size = nir->info.shared_size;
info->local_size = nir->info.shared_size;
nir_foreach_function_with_impl(func, impl, nir) {
unsigned offset =
agx_compile_function_nir(nir, impl, key, debug, binary, out);
agx_compile_function_nir(nir, impl, key, debug, &binary, &out->info);
if (func->is_preamble) {
out->preamble_offset = offset;
out->has_preamble = true;
info->preamble_offset = offset;
info->has_preamble = true;
} else if (func->is_entrypoint) {
out->main_offset = offset;
info->main_offset = offset;
info->main_size = binary.size - offset;
} else {
unreachable("General functions not yet supported");
}
}
if (nir->info.stage == MESA_SHADER_VERTEX) {
out->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
info->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
out->writes_layer_viewport =
info->writes_layer_viewport =
nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
out->uses_draw_id =
info->uses_draw_id =
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
out->uses_base_param =
info->uses_base_param =
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
} else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
out->disable_tri_merging = nir->info.uses_wide_subgroup_intrinsics ||
nir->info.fs.needs_quad_helper_invocations ||
nir->info.writes_memory;
info->disable_tri_merging = nir->info.uses_wide_subgroup_intrinsics ||
nir->info.fs.needs_quad_helper_invocations ||
nir->info.writes_memory;
/* Writing the sample mask requires tag writes */
out->tag_write_disable &= !out->writes_sample_mask;
info->tag_write_disable &= !info->writes_sample_mask;
/* Report a canonical depth layout. This happens at the end because the
* sample mask lowering affects it.
@ -3199,10 +3212,15 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
enum gl_frag_depth_layout layout = nir->info.fs.depth_layout;
if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
out->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
info->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
else if (layout == FRAG_DEPTH_LAYOUT_NONE)
out->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
info->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
else
out->depth_layout = layout;
info->depth_layout = layout;
info->reads_tib = nir->info.fs.uses_fbfetch_output;
}
out->binary = binary.data;
out->binary_size = binary.size;
}


@ -164,6 +164,16 @@ struct agx_fs_shader_key {
* tilebuffer loads (including blending).
*/
bool ignore_tib_dependencies;
/* When dynamic sample shading is used, the fragment shader is wrapped in a
* loop external to the API shader. This bit indicates that we are compiling
* inside the sample loop, meaning the execution nesting counter is already
* zero and must be preserved.
*/
bool inside_sample_loop;
/* Base coefficient register. 0 for API shaders but nonzero for FS prolog */
uint8_t cf_base;
};
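/* For example, if the FS prolog's cull-distance lowering needs to interpolate
 * varyings of its own, it is compiled with cf_base set to the number of
 * coefficient registers the API shader already uses; the compiler seeds
 * varyings.fs.nr_cf with cf_base, so the prolog's allocations start after the
 * main shader's. (Illustrative summary of the mechanism, not new code.)
 */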
struct agx_shader_key {
@ -191,6 +201,18 @@ struct agx_shader_key {
*/
bool promote_constants;
/* Set if this is a non-monolithic shader that must be linked with additional
* shader parts before the program can be used. This suppresses emission of
* `stop` instructions, which the linker must insert instead.
*/
bool no_stop;
/* Set if this is a secondary shader part (prolog or epilog). This prevents
* the compiler from allocating uniform registers. For example, this turns
* off preambles.
*/
bool secondary;
union {
struct agx_fs_shader_key fs;
};
@ -208,8 +230,7 @@ bool agx_nir_lower_cull_distance_fs(struct nir_shader *s,
void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
struct util_debug_callback *debug,
struct util_dynarray *binary,
struct agx_shader_info *out);
struct agx_shader_part *out);
struct agx_occupancy {
unsigned max_registers;


@ -123,14 +123,26 @@ run_tests_after_last_discard(nir_builder *b)
nir_intrinsic_instr *intr = last_discard_in_block(block);
if (intr) {
/* Last discard is executed unconditionally, so fuse tests. */
/* Last discard is executed unconditionally, so fuse tests:
*
* sample_mask (testing | killed), ~killed
*
* When testing, this is `sample_mask ~0, ~killed` which kills the
* kill set and triggers tests on the rest.
*
* When not testing, this is `sample_mask killed, ~killed` which is
* equivalent to `sample_mask killed, 0`, killing without testing.
*/
b->cursor = nir_before_instr(&intr->instr);
nir_def *all_samples = nir_imm_intN_t(b, ALL_SAMPLES, 16);
nir_def *killed = intr->src[0].ssa;
nir_def *live = nir_ixor(b, killed, all_samples);
nir_sample_mask_agx(b, all_samples, live);
nir_def *testing = nir_load_shader_part_tests_zs_agx(b);
nir_def *affected = nir_ior(b, testing, killed);
nir_sample_mask_agx(b, affected, live);
nir_instr_remove(&intr->instr);
return;
} else {
@ -142,8 +154,11 @@ run_tests_after_last_discard(nir_builder *b)
} else if (cf_node_contains_discard(node)) {
/* Conditionally executed block contains the last discard. Test
* depth/stencil for remaining samples in unconditional code after.
*
* If we're not testing, this turns into sample_mask(0, ~0) which is a
* no-op.
*/
nir_sample_mask_agx(b, nir_imm_intN_t(b, ALL_SAMPLES, 16),
nir_sample_mask_agx(b, nir_load_shader_part_tests_zs_agx(b),
nir_imm_intN_t(b, ALL_SAMPLES, 16));
return;
}
@ -163,6 +178,8 @@ run_tests_at_start(nir_shader *shader)
bool
agx_nir_lower_sample_mask(nir_shader *shader)
{
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
bool writes_zs =
shader->info.outputs_written &
(BITFIELD64_BIT(FRAG_RESULT_STENCIL) | BITFIELD64_BIT(FRAG_RESULT_DEPTH));
@ -179,7 +196,6 @@ agx_nir_lower_sample_mask(nir_shader *shader)
* we need to trigger tests explicitly. Allow sample_mask with zs_emit.
*/
if (!writes_zs) {
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_builder b = nir_builder_create(impl);
/* run tests late */
@ -187,9 +203,13 @@ agx_nir_lower_sample_mask(nir_shader *shader)
}
} else {
/* regular shaders that don't use discard have nothing to lower */
nir_metadata_preserve(impl, nir_metadata_all);
return false;
}
nir_metadata_preserve(impl,
nir_metadata_block_index | nir_metadata_dominance);
nir_shader_intrinsics_pass(shader, lower_discard_to_sample_mask_0,
nir_metadata_block_index | nir_metadata_dominance,
NULL);


@ -1144,11 +1144,13 @@ agx_pack_binary(agx_context *ctx, struct util_dynarray *emission)
util_dynarray_foreach(&fixups, struct agx_branch_fixup, fixup)
agx_fixup_branch(emission, *fixup);
/* Dougall calls the instruction in this footer "trap". Match the blob. */
for (unsigned i = 0; i < 8; ++i) {
uint16_t trap = agx_opcodes_info[AGX_OPCODE_TRAP].encoding.exact;
util_dynarray_append(emission, uint16_t, trap);
}
util_dynarray_fini(&fixups);
/* Dougall calls the instruction in this footer "trap". Match the blob. */
if (!ctx->key->no_stop || ctx->is_preamble) {
for (unsigned i = 0; i < 8; ++i) {
uint16_t trap = agx_opcodes_info[AGX_OPCODE_TRAP].encoding.exact;
util_dynarray_append(emission, uint16_t, trap);
}
}
}


@ -94,19 +94,6 @@ agx_translate_index_size(uint8_t size_B)
return __builtin_ctz(size_B);
}
static enum agx_pass_type
agx_pass_type_for_shader(struct agx_shader_info *info)
{
if (info->reads_tib && info->writes_sample_mask)
return AGX_PASS_TYPE_TRANSLUCENT_PUNCH_THROUGH;
else if (info->reads_tib)
return AGX_PASS_TYPE_TRANSLUCENT;
else if (info->writes_sample_mask)
return AGX_PASS_TYPE_PUNCH_THROUGH;
else
return AGX_PASS_TYPE_OPAQUE;
}
static enum agx_conservative_depth
agx_translate_depth_layout(enum gl_frag_depth_layout layout)
{


@ -7,7 +7,9 @@
#include "agx_bo.h"
#include "agx_compile.h"
#include "agx_nir_lower_vbo.h"
#include "agx_pack.h"
#include "nir_lower_blend.h"
struct agx_linked_shader {
/* Mapped executable memory */
@ -34,3 +36,111 @@ struct agx_linked_shader *
agx_fast_link(void *memctx, struct agx_device *dev, bool fragment,
struct agx_shader_part *main, struct agx_shader_part *prolog,
struct agx_shader_part *epilog, unsigned nr_samples_shaded);
/* These parts of the vertex element affect the generated code */
struct agx_velem_key {
uint32_t divisor;
uint16_t stride;
uint8_t format;
uint8_t pad;
};
struct agx_vs_prolog_key {
struct agx_velem_key attribs[AGX_MAX_VBUFS];
/* Bit mask of attribute components to load */
BITSET_DECLARE(component_mask, VERT_ATTRIB_MAX * 4);
/* Whether running as a hardware vertex shader (versus compute) */
bool hw;
/* If !hw and the draw call is indexed, the index size */
uint8_t sw_index_size_B;
};
struct agx_fs_prolog_key {
/* glSampleMask() mask */
uint8_t api_sample_mask;
/* Number of cull planes requiring lowering */
uint8_t cull_distance_size;
/* Need to count FRAGMENT_SHADER_INVOCATIONS */
bool statistics;
/* Need to lower desktop OpenGL polygon stipple */
bool polygon_stipple;
/* If we discard, whether we need to run Z/S tests */
bool run_zs_tests;
/* If we emulate cull distance, the base offset for our allocated coefficient
* registers so we don't interfere with the main shader.
*/
unsigned cf_base;
};
struct agx_blend_key {
nir_lower_blend_rt rt[8];
unsigned logicop_func;
bool alpha_to_coverage, alpha_to_one;
bool padding[2];
};
static_assert(sizeof(struct agx_blend_key) == 232, "packed");
struct agx_fs_epilog_link_info {
/* Base index of spilled render targets in the binding table */
uint8_t rt_spill_base;
/* Bit mask of the bit size written to each render target. Bit i set if RT i
* uses 32-bit registers, else 16-bit registers.
*/
uint8_t size_32;
/* If set, the API fragment shader uses sample shading. This means the epilog
* will be invoked per-sample as well.
*/
bool sample_shading;
/* If set, broadcast the render target #0 value to all render targets. This
* implements gl_FragColor semantics.
*/
bool broadcast_rt0;
/* If set, force render target 0's W channel to 1.0. This optimizes blending
* calculations in some applications.
*/
bool rt0_w_1;
/* If set, the API fragment shader wants to write depth/stencil respectively.
* This happens in the epilog for correctness when the epilog discards.
*/
bool write_z, write_s;
};
struct agx_fs_epilog_key {
/* Mask of render targets written by the main shader */
uint8_t rt_written;
struct agx_fs_epilog_link_info link;
/* Blend state. Blending happens in the epilog. */
struct agx_blend_key blend;
/* Tilebuffer configuration */
enum pipe_format rt_formats[8];
uint8_t nr_samples;
bool force_small_tile;
};
void agx_nir_vs_prolog(struct nir_builder *b, const void *key_);
void agx_nir_fs_epilog(struct nir_builder *b, const void *key_);
void agx_nir_fs_prolog(struct nir_builder *b, const void *key_);
bool agx_nir_lower_vs_input_to_prolog(nir_shader *s,
BITSET_WORD *attrib_components_read);
bool agx_nir_lower_fs_output_to_epilog(nir_shader *s,
struct agx_fs_epilog_link_info *out);
bool agx_nir_lower_fs_active_samples_to_register(nir_shader *s);
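/* Illustrative only (not code from this MR): building an FS epilog key from
 * the main shader's link info plus current blend/framebuffer state, then
 * generating the epilog NIR from it. "fb", "blend", "rt_written_mask" and
 * "link_info" stand for hypothetical driver-side state.
 */
static nir_shader *
build_fs_epilog(const struct pipe_framebuffer_state *fb,
                const struct agx_blend_key *blend, uint8_t rt_written_mask,
                struct agx_fs_epilog_link_info link_info)
{
   struct agx_fs_epilog_key key = {
      .rt_written = rt_written_mask, /* RTs written by the main shader */
      .link = link_info,             /* from agx_nir_lower_fs_output_to_epilog */
      .blend = *blend,
      .nr_samples = MAX2(fb->samples, 1),
   };

   for (unsigned i = 0; i < fb->nr_cbufs; ++i)
      key.rt_formats[i] = fb->cbufs[i] ? fb->cbufs[i]->format : PIPE_FORMAT_NONE;

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                                  &agx_nir_options, "FS epilog");
   agx_nir_fs_epilog(&b, &key);
   return b.shader;
}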


@ -4,6 +4,7 @@
*/
#include "agx_meta.h"
#include "agx_compile.h"
#include "agx_device.h" /* for AGX_MEMORY_TYPE_SHADER */
#include "agx_nir_passes.h"
#include "agx_tilebuffer.h"
@ -30,16 +31,12 @@ agx_compile_meta_shader(struct agx_meta_cache *cache, nir_shader *shader,
struct agx_shader_key *key,
struct agx_tilebuffer_layout *tib)
{
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
agx_nir_lower_texture(shader);
agx_preprocess_nir(shader, cache->dev->libagx);
if (tib) {
unsigned bindless_base = 0;
agx_nir_lower_tilebuffer(shader, tib, NULL, &bindless_base, NULL);
agx_nir_lower_monolithic_msaa(
shader, &(struct agx_msaa_state){.nr_samples = tib->nr_samples});
agx_nir_lower_monolithic_msaa(shader, tib->nr_samples);
agx_nir_lower_multisampled_image_store(shader);
nir_shader_intrinsics_pass(
@ -50,11 +47,13 @@ agx_compile_meta_shader(struct agx_meta_cache *cache, nir_shader *shader,
key->libagx = cache->dev->libagx;
struct agx_meta_shader *res = rzalloc(cache->ht, struct agx_meta_shader);
agx_compile_shader_nir(shader, key, NULL, &binary, &res->info);
struct agx_shader_part bin;
agx_compile_shader_nir(shader, key, NULL, &bin);
res->ptr = agx_pool_upload_aligned_with_bo(&cache->pool, binary.data,
binary.size, 128, &res->bo);
util_dynarray_fini(&binary);
res->info = bin.info;
res->ptr = agx_pool_upload_aligned_with_bo(&cache->pool, bin.binary,
bin.binary_size, 128, &res->bo);
free(bin.binary);
ralloc_free(shader);
return res;


@ -310,7 +310,7 @@ lower_gs_count_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
static bool
lower_id(nir_builder *b, nir_intrinsic_instr *intr, void *data)
lower_prolog_id(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
b->cursor = nir_before_instr(&intr->instr);
@ -319,6 +319,34 @@ lower_id(nir_builder *b, nir_intrinsic_instr *intr, void *data)
id = load_primitive_id(b);
else if (intr->intrinsic == nir_intrinsic_load_instance_id)
id = load_instance_id(b);
else
return false;
b->cursor = nir_instr_remove(&intr->instr);
nir_def_rewrite_uses(&intr->def, id);
return true;
}
bool
agx_nir_lower_sw_vs_id(nir_shader *s)
{
return nir_shader_intrinsics_pass(
s, lower_prolog_id, nir_metadata_dominance | nir_metadata_block_index,
NULL);
}
static bool
lower_id(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
bool *lower_instance = data;
b->cursor = nir_before_instr(&intr->instr);
nir_def *id;
if (intr->intrinsic == nir_intrinsic_load_primitive_id)
id = load_primitive_id(b);
else if (intr->intrinsic == nir_intrinsic_load_instance_id &&
*lower_instance)
id = load_instance_id(b);
else if (intr->intrinsic == nir_intrinsic_load_num_vertices)
id = nir_channel(b, nir_load_num_workgroups(b), 0);
else if (intr->intrinsic == nir_intrinsic_load_flat_mask)
@ -360,8 +388,9 @@ agx_nir_create_geometry_count_shader(nir_shader *gs, const nir_shader *libagx,
NIR_PASS(_, shader, nir_shader_intrinsics_pass, lower_gs_count_instr,
nir_metadata_block_index | nir_metadata_dominance, state);
bool lower_instance = true;
NIR_PASS(_, shader, nir_shader_intrinsics_pass, lower_id,
nir_metadata_block_index | nir_metadata_dominance, NULL);
nir_metadata_block_index | nir_metadata_dominance, &lower_instance);
agx_preprocess_nir(shader, libagx);
return shader;
@ -427,9 +456,11 @@ lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data)
case nir_intrinsic_load_flat_mask:
case nir_intrinsic_load_provoking_last:
case nir_intrinsic_load_input_topology_agx:
case nir_intrinsic_load_input_topology_agx: {
/* Lowering the same in both GS variants */
return lower_id(b, intr, data);
bool lower_instance = true;
return lower_id(b, intr, &lower_instance);
}
case nir_intrinsic_end_primitive_with_counter:
case nir_intrinsic_set_vertex_and_primitive_count:
@ -1202,8 +1233,9 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
*gs_copy = agx_nir_create_gs_rast_shader(gs, libagx);
bool lower_instance = true;
NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
nir_metadata_block_index | nir_metadata_dominance, NULL);
nir_metadata_block_index | nir_metadata_dominance, &lower_instance);
link_libagx(gs, libagx);
@ -1280,8 +1312,9 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
NIR_PASS(_, gs, nir_opt_sink, ~0);
NIR_PASS(_, gs, nir_opt_move, ~0);
NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
nir_metadata_block_index | nir_metadata_dominance, NULL);
nir_metadata_block_index | nir_metadata_dominance, &lower_instance);
/* Create auxiliary programs */
*pre_gs = agx_nir_create_pre_gs(
@ -1334,22 +1367,20 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
bool
agx_nir_lower_vs_before_gs(struct nir_shader *vs,
const struct nir_shader *libagx,
unsigned index_size_B, uint64_t *outputs)
const struct nir_shader *libagx, uint64_t *outputs)
{
bool progress = false;
/* Lower vertex ID to an index buffer pull without a topology applied */
progress |= agx_nir_lower_index_buffer(vs, index_size_B, false);
/* Lower vertex stores to memory stores */
progress |= nir_shader_intrinsics_pass(
vs, lower_vs_before_gs, nir_metadata_block_index | nir_metadata_dominance,
&index_size_B);
NULL);
/* Lower instance ID and num vertices */
/* Lower num vertices */
bool lower_instance = false;
progress |= nir_shader_intrinsics_pass(
vs, lower_id, nir_metadata_block_index | nir_metadata_dominance, NULL);
vs, lower_id, nir_metadata_block_index | nir_metadata_dominance,
&lower_instance);
/* Link libagx, used in lower_vs_before_gs */
if (progress)


@ -30,9 +30,11 @@ struct nir_def *agx_vertex_id_for_topology_class(struct nir_builder *b,
bool agx_nir_lower_index_buffer(struct nir_shader *s, unsigned index_size_B,
bool patches);
bool agx_nir_lower_sw_vs_id(nir_shader *s);
bool agx_nir_lower_vs_before_gs(struct nir_shader *vs,
const struct nir_shader *libagx,
unsigned index_size_B, uint64_t *outputs);
uint64_t *outputs);
bool agx_nir_lower_gs(struct nir_shader *gs, const struct nir_shader *libagx,
bool rasterizer_discard, struct nir_shader **gs_count,


@ -8,17 +8,19 @@
#include "agx_tilebuffer.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
static bool
lower_wrapped(nir_builder *b, nir_intrinsic_instr *intr, void *data)
lower_to_per_sample(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
nir_def *sample_id = data;
b->cursor = nir_before_instr(&intr->instr);
switch (intr->intrinsic) {
case nir_intrinsic_load_sample_id: {
unsigned size = intr->def.bit_size;
nir_def_rewrite_uses(&intr->def, nir_u2uN(b, sample_id, size));
nir_def *mask = nir_u2u32(b, nir_load_active_samples_agx(b));
nir_def *bit = nir_ufind_msb(b, mask);
nir_def_rewrite_uses(&intr->def, nir_u2uN(b, bit, intr->def.bit_size));
nir_instr_remove(&intr->instr);
return true;
}
@ -26,15 +28,17 @@ lower_wrapped(nir_builder *b, nir_intrinsic_instr *intr, void *data)
case nir_intrinsic_load_local_pixel_agx:
case nir_intrinsic_store_local_pixel_agx:
case nir_intrinsic_store_zs_agx:
case nir_intrinsic_discard_agx: {
/* Fragment I/O inside the loop should only affect one sample. */
case nir_intrinsic_discard_agx:
case nir_intrinsic_sample_mask_agx: {
/* Fragment I/O inside the loop should only affect active samples. */
unsigned mask_index =
(intr->intrinsic == nir_intrinsic_store_local_pixel_agx) ? 1 : 0;
nir_def *mask = intr->src[mask_index].ssa;
nir_def *id_mask = nir_ishl(b, nir_imm_intN_t(b, 1, mask->bit_size),
nir_u2u32(b, sample_id));
nir_src_rewrite(&intr->src[mask_index], nir_iand(b, mask, id_mask));
nir_def *id_mask = nir_load_active_samples_agx(b);
nir_def *converted = nir_u2uN(b, id_mask, mask->bit_size);
nir_src_rewrite(&intr->src[mask_index], nir_iand(b, mask, converted));
return true;
}
@ -43,6 +47,25 @@ lower_wrapped(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
}
bool
agx_nir_lower_to_per_sample(nir_shader *shader)
{
return nir_shader_intrinsics_pass(
shader, lower_to_per_sample,
nir_metadata_block_index | nir_metadata_dominance, NULL);
}
static bool
lower_active_samples(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic != nir_intrinsic_load_active_samples_agx)
return false;
b->cursor = nir_instr_remove(&intr->instr);
nir_def_rewrite_uses(&intr->def, data);
return true;
}
/*
* In a monolithic pixel shader, we wrap the fragment shader in a loop over
* each sample, and then let optimizations (like loop unrolling) go to town.
@ -65,138 +88,57 @@ agx_nir_wrap_per_sample_loop(nir_shader *shader, uint8_t nr_samples)
nir_variable *i =
nir_local_variable_create(impl, glsl_uintN_t_type(16), NULL);
nir_store_var(&b, i, nir_imm_intN_t(&b, 0, 16), ~0);
nir_def *index = NULL;
nir_store_var(&b, i, nir_imm_intN_t(&b, 1, 16), ~0);
nir_def *bit = NULL;
nir_def *end_bit = nir_imm_intN_t(&b, 1 << nr_samples, 16);
/* Create a loop in the wrapped function */
nir_loop *loop = nir_push_loop(&b);
{
index = nir_load_var(&b, i);
nir_push_if(&b, nir_uge(&b, index, nir_imm_intN_t(&b, nr_samples, 16)));
bit = nir_load_var(&b, i);
nir_push_if(&b, nir_uge(&b, bit, end_bit));
{
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, NULL);
b.cursor = nir_cf_reinsert(&list, b.cursor);
nir_store_var(&b, i, nir_iadd_imm(&b, index, 1), ~0);
nir_store_var(&b, i, nir_ishl_imm(&b, bit, 1), ~0);
}
nir_pop_loop(&b, loop);
/* We've mucked about with control flow */
nir_metadata_preserve(impl, nir_metadata_none);
/* Use the loop counter as the sample ID each iteration */
nir_shader_intrinsics_pass(shader, lower_wrapped,
/* Use the loop variable as the active sample mask each iteration */
nir_shader_intrinsics_pass(shader, lower_active_samples,
nir_metadata_block_index | nir_metadata_dominance,
index);
bit);
return true;
}
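/* After this rewrite, the wrapper loop iterates over a one-hot active-sample
 * mask rather than a sample index: for nr_samples = 4 the loop variable takes
 * the values 0b0001, 0b0010, 0b0100, 0b1000 and exits at 0b10000. Each
 * iteration, load_active_samples_agx returns that single bit, per-pixel I/O is
 * masked with it, and where a sample ID is still needed it is recovered as
 * ufind_msb(active_samples). (Illustrative summary, not new code.)
 */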
static bool
lower_sample_mask_write(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct agx_msaa_state *state = data;
b->cursor = nir_before_instr(&intr->instr);
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
if (sem.location != FRAG_RESULT_SAMPLE_MASK)
return false;
/* Sample mask writes are ignored unless multisampling is used. If it is
* used, the Vulkan spec says:
*
* If sample shading is enabled, bits written to SampleMask
* corresponding to samples that are not being shaded by the fragment
* shader invocation are ignored.
*
* That will be satisfied by outputting gl_SampleMask for the whole pixel
* and then lowering sample shading after (splitting up discard targets).
*/
if (state->nr_samples != 1) {
nir_discard_agx(b, nir_inot(b, nir_u2u16(b, intr->src[0].ssa)));
b->shader->info.fs.uses_discard = true;
}
nir_instr_remove(&intr->instr);
return true;
}
/*
* Apply API sample mask to sample mask inputs, lowering:
*
* sample_mask_in --> sample_mask_in & api_sample_mask
*/
static bool
lower_sample_mask_read(nir_builder *b, nir_intrinsic_instr *intr,
UNUSED void *_)
{
b->cursor = nir_after_instr(&intr->instr);
if (intr->intrinsic != nir_intrinsic_load_sample_mask_in)
return false;
nir_def *old = &intr->def;
nir_def *lowered = nir_iand(
b, old, nir_u2uN(b, nir_load_api_sample_mask_agx(b), old->bit_size));
nir_def_rewrite_uses_after(old, lowered, lowered->parent_instr);
return true;
}
/* glSampleMask(x) --> gl_SampleMask = x */
static void
insert_sample_mask_write(nir_shader *s)
{
nir_builder b;
nir_function_impl *impl = nir_shader_get_entrypoint(s);
b = nir_builder_at(nir_before_impl(impl));
/* Kill samples that are NOT covered by the mask */
nir_discard_agx(&b, nir_inot(&b, nir_load_api_sample_mask_agx(&b)));
s->info.fs.uses_discard = true;
}
/*
* Lower a fragment shader into a monolithic pixel shader, with static sample
* count, blend state, and tilebuffer formats in the shader key. For dynamic,
* epilogs must be used, which have separate lowerings.
*/
bool
agx_nir_lower_monolithic_msaa(nir_shader *shader, struct agx_msaa_state *state)
agx_nir_lower_monolithic_msaa(nir_shader *shader, uint8_t nr_samples)
{
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
assert(state->nr_samples == 1 || state->nr_samples == 2 ||
state->nr_samples == 4);
/* Lower gl_SampleMask writes */
if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
nir_shader_intrinsics_pass(
shader, lower_sample_mask_write,
nir_metadata_block_index | nir_metadata_dominance, state);
}
/* Lower API sample masks */
if ((state->nr_samples > 1) && state->api_sample_mask)
insert_sample_mask_write(shader);
/* Additional, sample_mask_in needs to account for the API-level mask */
nir_shader_intrinsics_pass(shader, lower_sample_mask_read,
nir_metadata_block_index | nir_metadata_dominance,
&state->nr_samples);
assert(nr_samples == 1 || nr_samples == 2 || nr_samples == 4);
agx_nir_lower_sample_mask(shader);
/* In single sampled programs, interpolateAtSample needs to return the
* center pixel. TODO: Generalize for dynamic sample count.
* center pixel.
*/
if (state->nr_samples == 1)
if (nr_samples == 1)
nir_lower_single_sampled(shader);
else if (shader->info.fs.uses_sample_shading)
agx_nir_wrap_per_sample_loop(shader, state->nr_samples);
else if (shader->info.fs.uses_sample_shading) {
agx_nir_lower_to_per_sample(shader);
agx_nir_wrap_per_sample_loop(shader, nr_samples);
}
return true;
}


@ -5,18 +5,22 @@
*/
#include "agx_tilebuffer.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
static nir_def *
mask_by_sample_id(nir_builder *b, nir_def *mask)
select_if_msaa_else_0(nir_builder *b, nir_def *x)
{
nir_def *id_mask =
nir_ishl(b, nir_imm_intN_t(b, 1, mask->bit_size), nir_load_sample_id(b));
return nir_iand(b, mask, id_mask);
/* Sample count > 1 <==> log2(Sample count) > 0 */
nir_def *msaa = nir_ugt_imm(b, nir_load_samples_log2_agx(b), 0);
return nir_bcsel(b, msaa, x, nir_imm_intN_t(b, 0, x->bit_size));
}
static bool
lower_to_sample(nir_builder *b, nir_intrinsic_instr *intr, void *_)
lower(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
b->cursor = nir_before_instr(&intr->instr);
@ -58,20 +62,33 @@ lower_to_sample(nir_builder *b, nir_intrinsic_instr *intr, void *_)
}
case nir_intrinsic_load_sample_mask_in: {
/* In OpenGL, gl_SampleMaskIn is only supposed to have the single bit set
* of the sample currently being shaded when sample shading is used. Mask
* by the sample ID to make that happen.
/* Apply API sample mask to sample mask inputs, lowering:
*
* sample_mask_in --> sample_mask_in & api_sample_mask
*
* Furthermore in OpenGL, gl_SampleMaskIn is only supposed to have the
* single bit set of the sample currently being shaded when sample shading
* is used. Mask by the sample ID to make that happen.
*/
b->cursor = nir_after_instr(&intr->instr);
nir_def *old = &intr->def;
nir_def *lowered = mask_by_sample_id(b, old);
nir_def *lowered = nir_iand(
b, old, nir_u2uN(b, nir_load_api_sample_mask_agx(b), old->bit_size));
if (b->shader->info.fs.uses_sample_shading) {
nir_def *bit = nir_load_active_samples_agx(b);
lowered = nir_iand(b, lowered, nir_u2uN(b, bit, old->bit_size));
}
nir_def_rewrite_uses_after(old, lowered, lowered->parent_instr);
return true;
}
case nir_intrinsic_load_barycentric_sample: {
/* Lower fragment varyings with "sample" interpolation to
* interpolateAtSample() with the sample ID
* interpolateAtSample() with the sample ID. If multisampling is disabled,
* the sample ID is 0, so we don't need to mask unlike for
* load_barycentric_at_sample.
*/
b->cursor = nir_after_instr(&intr->instr);
nir_def *old = &intr->def;
@ -84,6 +101,56 @@ lower_to_sample(nir_builder *b, nir_intrinsic_instr *intr, void *_)
return true;
}
case nir_intrinsic_load_barycentric_at_sample: {
/*
* In OpenGL, interpolateAtSample interpolates at the centre when
* multisampling is disabled. Furthermore, results are undefined when
* multisampling is enabled but the sample ID is out-of-bounds.
*
* To handle the former case, we force the sample ID to 0 when
* multisampling is disabled. To optimize the latter case, we force the
* sample ID to 0 when the requested sample is definitively out-of-bounds.
*/
b->cursor = nir_before_instr(&intr->instr);
nir_src *src = &intr->src[0];
nir_def *sample = src->ssa;
if (nir_src_is_const(*src) && nir_src_as_uint(*src) >= 4) {
sample = nir_imm_int(b, 0);
} else {
sample = select_if_msaa_else_0(b, sample);
}
nir_src_rewrite(src, sample);
return true;
}
case nir_intrinsic_store_output: {
/*
* Sample mask writes are ignored unless multisampling is used. If it is
* used, the Vulkan spec says:
*
* If sample shading is enabled, bits written to SampleMask
* corresponding to samples that are not being shaded by the fragment
* shader invocation are ignored.
*
* That will be satisfied by outputting gl_SampleMask for the whole pixel
* and then lowering sample shading after (splitting up discard targets).
*/
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
if (sem.location != FRAG_RESULT_SAMPLE_MASK)
return false;
nir_def *mask = nir_inot(b, nir_u2u16(b, intr->src[0].ssa));
nir_discard_agx(b, select_if_msaa_else_0(b, mask));
nir_instr_remove(&intr->instr);
b->shader->info.fs.uses_discard = true;
return true;
}
default:
return false;
}
@ -105,14 +172,6 @@ lower_to_sample(nir_builder *b, nir_intrinsic_instr *intr, void *_)
bool
agx_nir_lower_sample_intrinsics(nir_shader *shader)
{
/* If sample shading is disabled, the unlowered shader will broadcast pixel
* values across the sample (the default). By definition, there are no sample
* position or sample barycentrics, as these trigger sample shading.
*/
if (!shader->info.fs.uses_sample_shading)
return false;
return nir_shader_intrinsics_pass(
shader, lower_to_sample,
nir_metadata_block_index | nir_metadata_dominance, NULL);
shader, lower, nir_metadata_block_index | nir_metadata_dominance, NULL);
}


@ -0,0 +1,580 @@
/*
* Copyright 2024 Alyssa Rosenzweig
* Copyright 2024 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "gallium/include/pipe/p_defines.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_nir_passes.h"
#include "agx_pack.h"
#include "agx_tilebuffer.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_lower_blend.h"
#include "shader_enums.h"
/*
* Insert code into a fragment shader to lower polygon stipple. The stipple is
* passed in a sideband, rather than requiring a texture binding. This is
* simpler for drivers to integrate and might be more efficient.
*/
static bool
agx_nir_lower_poly_stipple(nir_shader *s)
{
assert(s->info.stage == MESA_SHADER_FRAGMENT);
/* Insert at the beginning for performance. */
nir_builder b_ =
nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
nir_builder *b = &b_;
/* The stipple coordinate is defined as the window coordinate mod 32. It's
* reversed along the X-axis to simplify the driver, hence the NOT.
*/
nir_def *raw = nir_u2u32(b, nir_load_pixel_coord(b));
nir_def *coord = nir_umod_imm(
b,
nir_vec2(b, nir_inot(b, nir_channel(b, raw, 0)), nir_channel(b, raw, 1)),
32);
/* Extract the column from the packed bitfield */
nir_def *pattern = nir_load_polygon_stipple_agx(b, nir_channel(b, coord, 1));
nir_def *bit = nir_ubitfield_extract(b, pattern, nir_channel(b, coord, 0),
nir_imm_int(b, 1));
/* Discard fragments where the pattern is 0 */
nir_discard_if(b, nir_ieq_imm(b, bit, 0));
s->info.fs.uses_discard = true;
nir_metadata_preserve(b->impl,
nir_metadata_dominance | nir_metadata_block_index);
return true;
}
static bool
lower_vbo(nir_shader *s, const struct agx_velem_key *key)
{
struct agx_attribute out[AGX_MAX_VBUFS];
for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
out[i] = (struct agx_attribute){
.divisor = key[i].divisor,
.stride = key[i].stride,
.format = key[i].format,
};
}
return agx_nir_lower_vbo(s, out);
}
static int
map_vs_part_uniform(nir_intrinsic_instr *intr, unsigned nr_attribs)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_vbo_base_agx:
return 4 * nir_src_as_uint(intr->src[0]);
case nir_intrinsic_load_attrib_clamp_agx:
return (4 * nr_attribs) + (2 * nir_src_as_uint(intr->src[0]));
case nir_intrinsic_load_base_instance:
return (6 * nr_attribs);
case nir_intrinsic_load_first_vertex:
return (6 * nr_attribs) + 2;
case nir_intrinsic_load_input_assembly_buffer_agx:
return (6 * nr_attribs) + 4;
default:
return -1;
}
}
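/* Worked example (illustrative) of the mapping above for nr_attribs = 2, in
 * 16-bit uniform slots:
 *
 *   u0..u3   vbo_base[0]        u4..u7   vbo_base[1]
 *   u8..u9   attrib_clamp[0]    u10..u11 attrib_clamp[1]
 *   u12..u13 base_instance      u14..u15 first_vertex
 *   u16..u19 input_assembly_buffer
 *
 * The driver's lay_out_uniforms() pushes matching ranges so these slots are
 * populated without a per-shader layout pass.
 */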
static int
map_fs_part_uniform(nir_intrinsic_instr *intr)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_blend_const_color_r_float:
return 4;
case nir_intrinsic_load_blend_const_color_g_float:
return 6;
case nir_intrinsic_load_blend_const_color_b_float:
return 8;
case nir_intrinsic_load_blend_const_color_a_float:
return 10;
default:
return -1;
}
}
static bool
lower_non_monolithic_uniforms(nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
int unif;
if (b->shader->info.stage == MESA_SHADER_VERTEX) {
unsigned *nr_attribs = data;
unif = map_vs_part_uniform(intr, *nr_attribs);
} else {
unif = map_fs_part_uniform(intr);
}
if (unif >= 0) {
b->cursor = nir_instr_remove(&intr->instr);
nir_def *load = nir_load_preamble(b, 1, intr->def.bit_size, .base = unif);
nir_def_rewrite_uses(&intr->def, load);
return true;
} else if (intr->intrinsic == nir_intrinsic_load_texture_handle_agx) {
b->cursor = nir_instr_remove(&intr->instr);
nir_def *offs =
nir_imul_imm(b, nir_u2u32(b, intr->src[0].ssa), AGX_TEXTURE_LENGTH);
nir_def_rewrite_uses(&intr->def, nir_vec2(b, nir_imm_int(b, 0), offs));
return true;
} else {
return false;
}
}
void
agx_nir_vs_prolog(nir_builder *b, const void *key_)
{
const struct agx_vs_prolog_key *key = key_;
b->shader->info.stage = MESA_SHADER_VERTEX;
b->shader->info.name = "VS prolog";
/* First, construct a passthrough shader reading each attribute and exporting
* the value. We also need to export vertex/instance ID in their usual regs.
*/
unsigned i = 0;
nir_def *vec = NULL;
unsigned vec_idx = ~0;
BITSET_FOREACH_SET(i, key->component_mask, VERT_ATTRIB_MAX * 4) {
unsigned a = i / 4;
unsigned c = i % 4;
if (vec_idx != a) {
vec = nir_load_input(b, 4, 32, nir_imm_int(b, 0), .base = a);
}
/* ABI: attributes passed starting at r8 */
nir_export_agx(b, nir_channel(b, vec, c), .base = 2 * (8 + i));
}
nir_export_agx(b, nir_load_vertex_id(b), .base = 5 * 2);
nir_export_agx(b, nir_load_instance_id(b), .base = 6 * 2);
/* Now lower the resulting program using the key */
lower_vbo(b->shader, key->attribs);
if (!key->hw) {
agx_nir_lower_index_buffer(b->shader, key->sw_index_size_B, false);
agx_nir_lower_sw_vs_id(b->shader);
}
/* Finally, lower uniforms according to our ABI */
unsigned nr = DIV_ROUND_UP(BITSET_LAST_BIT(key->component_mask), 4);
nir_shader_intrinsics_pass(b->shader, lower_non_monolithic_uniforms,
nir_metadata_dominance | nir_metadata_block_index,
&nr);
b->shader->info.io_lowered = true;
}
static bool
lower_input_to_prolog(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
if (intr->intrinsic != nir_intrinsic_load_input)
return false;
unsigned idx = nir_src_as_uint(intr->src[0]) + nir_intrinsic_base(intr);
unsigned comp = nir_intrinsic_component(intr);
assert(intr->def.bit_size == 32 && "todo: push conversions up?");
unsigned base = 4 * idx + comp;
b->cursor = nir_before_instr(&intr->instr);
nir_def *val = nir_load_exported_agx(
b, intr->def.num_components, intr->def.bit_size, .base = 16 + 2 * base);
BITSET_WORD *comps_read = data;
nir_component_mask_t mask = nir_def_components_read(&intr->def);
u_foreach_bit(c, mask) {
BITSET_SET(comps_read, base + c);
}
nir_def_rewrite_uses(&intr->def, val);
nir_instr_remove(&intr->instr);
return true;
}
bool
agx_nir_lower_vs_input_to_prolog(nir_shader *s,
BITSET_WORD *attrib_components_read)
{
return nir_shader_intrinsics_pass(
s, lower_input_to_prolog,
nir_metadata_dominance | nir_metadata_block_index,
attrib_components_read);
}
static bool
lower_active_samples_to_register(nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
if (intr->intrinsic != nir_intrinsic_load_active_samples_agx)
return false;
b->cursor = nir_instr_remove(&intr->instr);
/* ABI: r0h contains the active sample mask */
nir_def *id = nir_load_exported_agx(b, 1, 16, .base = 1);
nir_def_rewrite_uses(&intr->def, id);
return true;
}
static bool
lower_tests_zs_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
bool *value = data;
if (intr->intrinsic != nir_intrinsic_load_shader_part_tests_zs_agx)
return false;
b->cursor = nir_instr_remove(&intr->instr);
nir_def_rewrite_uses(&intr->def, nir_imm_intN_t(b, *value ? 0xFF : 0, 16));
return true;
}
static bool
lower_tests_zs(nir_shader *s, bool value)
{
if (!s->info.fs.uses_discard)
return false;
return nir_shader_intrinsics_pass(
s, lower_tests_zs_intr, nir_metadata_dominance | nir_metadata_block_index,
&value);
}
static inline bool
blend_uses_2src(nir_lower_blend_rt rt)
{
enum pipe_blendfactor factors[] = {
rt.rgb.src_factor,
rt.rgb.dst_factor,
rt.alpha.src_factor,
rt.alpha.dst_factor,
};
for (unsigned i = 0; i < ARRAY_SIZE(factors); ++i) {
switch (factors[i]) {
case PIPE_BLENDFACTOR_SRC1_COLOR:
case PIPE_BLENDFACTOR_SRC1_ALPHA:
case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
return true;
default:
break;
}
}
return false;
}
void
agx_nir_fs_epilog(nir_builder *b, const void *key_)
{
const struct agx_fs_epilog_key *key = key_;
b->shader->info.stage = MESA_SHADER_FRAGMENT;
b->shader->info.name = "FS epilog";
/* First, construct a passthrough shader reading each colour and outputting
* the value.
*/
u_foreach_bit(rt, key->rt_written) {
bool dual_src = (rt == 1) && blend_uses_2src(key->blend.rt[0]);
unsigned read_rt = (key->link.broadcast_rt0 && !dual_src) ? 0 : rt;
unsigned size = (key->link.size_32 & BITFIELD_BIT(read_rt)) ? 32 : 16;
nir_def *value =
nir_load_exported_agx(b, 4, size, .base = 2 * (4 + (4 * read_rt)));
if (key->link.rt0_w_1 && read_rt == 0) {
value =
nir_vector_insert_imm(b, value, nir_imm_floatN_t(b, 1.0, size), 3);
}
nir_store_output(
b, value, nir_imm_int(b, 0),
.io_semantics.location = FRAG_RESULT_DATA0 + (dual_src ? 0 : rt),
.io_semantics.dual_source_blend_index = dual_src);
}
if (key->link.sample_shading) {
/* Ensure the sample ID is preserved in register */
nir_export_agx(b, nir_load_exported_agx(b, 1, 16, .base = 1), .base = 1);
}
/* Now lower the resulting program using the key */
struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout(
key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples, true);
if (key->force_small_tile)
tib.tile_size = (struct agx_tile_size){16, 16};
bool force_translucent = false;
nir_lower_blend_options opts = {
.scalar_blend_const = true,
.logicop_enable = key->blend.logicop_func != PIPE_LOGICOP_COPY,
.logicop_func = key->blend.logicop_func,
};
static_assert(ARRAY_SIZE(opts.format) == 8, "max RTs out of sync");
memcpy(opts.rt, key->blend.rt, sizeof(opts.rt));
for (unsigned i = 0; i < 8; ++i) {
opts.format[i] = key->rt_formats[i];
}
/* It's more efficient to use masked stores (with
* agx_nir_lower_tilebuffer) than to emulate colour masking with
* nir_lower_blend.
*/
uint8_t colormasks[8] = {0};
for (unsigned i = 0; i < 8; ++i) {
if (key->rt_formats[i] == PIPE_FORMAT_NONE)
continue;
/* TODO: Flakes some dEQPs, seems to invoke UB. Revisit later.
* dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.77
* dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.98
*/
if (0 /* agx_tilebuffer_supports_mask(&tib, i) */) {
colormasks[i] = key->blend.rt[i].colormask;
opts.rt[i].colormask = (uint8_t)BITFIELD_MASK(4);
} else {
colormasks[i] = (uint8_t)BITFIELD_MASK(4);
}
/* If not all bound RTs are fully written to, we need to force
* translucent pass type. agx_nir_lower_tilebuffer will take
* care of this for its own colormasks input.
*/
unsigned comps = util_format_get_nr_components(key->rt_formats[i]);
if ((opts.rt[i].colormask & BITFIELD_MASK(comps)) !=
BITFIELD_MASK(comps)) {
force_translucent = true;
}
}
/* Alpha-to-coverage must be lowered before alpha-to-one */
if (key->blend.alpha_to_coverage)
NIR_PASS(_, b->shader, agx_nir_lower_alpha_to_coverage, tib.nr_samples);
/* Depth/stencil writes must be deferred until after all discards,
* particularly alpha-to-coverage.
*/
if (key->link.write_z || key->link.write_s) {
nir_store_zs_agx(
b, nir_imm_intN_t(b, 0xFF, 16),
nir_load_exported_agx(b, 1, 32, .base = 4),
nir_load_exported_agx(b, 1, 16, .base = 6),
.base = (key->link.write_z ? 1 : 0) | (key->link.write_s ? 2 : 0));
if (key->link.write_z)
b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_DEPTH);
if (key->link.write_s)
b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_STENCIL);
}
/* Alpha-to-one must be lowered before blending */
if (key->blend.alpha_to_one)
NIR_PASS(_, b->shader, agx_nir_lower_alpha_to_one);
NIR_PASS(_, b->shader, nir_lower_blend, &opts);
unsigned rt_spill = key->link.rt_spill_base;
NIR_PASS(_, b->shader, agx_nir_lower_tilebuffer, &tib, colormasks, &rt_spill,
&force_translucent);
NIR_PASS(_, b->shader, agx_nir_lower_multisampled_image_store);
/* If the API shader runs once per sample, then the epilog runs once per
* sample as well, so we need to lower our code to run for a single sample.
*
* If the API shader runs once per pixel, then the epilog runs once per
* pixel. So we run through the monolithic MSAA lowering, which wraps the
* epilog in the sample loop if needed. This localizes sample shading
* to the epilog, when sample shading is not used but blending is.
*/
if (key->link.sample_shading) {
NIR_PASS(_, b->shader, agx_nir_lower_to_per_sample);
NIR_PASS(_, b->shader, agx_nir_lower_fs_active_samples_to_register);
} else {
NIR_PASS(_, b->shader, agx_nir_lower_monolithic_msaa, key->nr_samples);
}
/* Finally, lower uniforms according to our ABI */
nir_shader_intrinsics_pass(b->shader, lower_non_monolithic_uniforms,
nir_metadata_dominance | nir_metadata_block_index,
NULL);
/* There is no shader part after the epilog, so we're always responsible for
* running our own tests.
*/
NIR_PASS(_, b->shader, lower_tests_zs, true);
b->shader->info.io_lowered = true;
b->shader->info.fs.uses_fbfetch_output |= force_translucent;
b->shader->info.fs.uses_sample_shading = key->link.sample_shading;
}
static bool
lower_output_to_epilog(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct agx_fs_epilog_link_info *info = data;
if (intr->intrinsic == nir_intrinsic_store_zs_agx) {
assert(nir_src_as_uint(intr->src[0]) == 0xff && "msaa not yet lowered");
b->cursor = nir_instr_remove(&intr->instr);
unsigned base = nir_intrinsic_base(intr);
info->write_z = base & 1;
info->write_s = base & 2;
/* ABI: r2 contains the written depth */
if (info->write_z)
nir_export_agx(b, intr->src[1].ssa, .base = 4);
/* ABI: r3l contains the written stencil */
if (info->write_s)
nir_export_agx(b, intr->src[2].ssa, .base = 6);
return true;
}
if (intr->intrinsic != nir_intrinsic_store_output)
return false;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
/* Fix up gl_FragColor */
if (sem.location == FRAG_RESULT_COLOR) {
sem.location = FRAG_RESULT_DATA0;
info->broadcast_rt0 = true;
}
/* We don't use the epilog for sample mask writes */
if (sem.location < FRAG_RESULT_DATA0)
return false;
/* Determine the render target index. Dual source blending aliases a second
* render target, so get that out of the way now.
*/
unsigned rt = sem.location - FRAG_RESULT_DATA0;
if (sem.dual_source_blend_index) {
assert(rt == 0);
rt = 1;
b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_DATA1);
}
b->cursor = nir_instr_remove(&intr->instr);
nir_def *vec = intr->src[0].ssa;
if (vec->bit_size == 32)
info->size_32 |= BITFIELD_BIT(rt);
else
assert(vec->bit_size == 16);
uint32_t one_f = (vec->bit_size == 32 ? fui(1.0) : _mesa_float_to_half(1.0));
u_foreach_bit(c, nir_intrinsic_write_mask(intr)) {
nir_scalar s = nir_scalar_resolved(vec, c);
if (rt == 0 && c == 3 && nir_scalar_is_const(s) &&
nir_scalar_as_uint(s) == one_f) {
info->rt0_w_1 = true;
} else {
unsigned stride = vec->bit_size / 16;
nir_export_agx(b, nir_channel(b, vec, c),
.base = (2 * (4 + (4 * rt))) + c * stride);
}
}
return true;
}
bool
agx_nir_lower_fs_output_to_epilog(nir_shader *s,
struct agx_fs_epilog_link_info *out)
{
return nir_shader_intrinsics_pass(
s, lower_output_to_epilog,
nir_metadata_dominance | nir_metadata_block_index, out);
}
bool
agx_nir_lower_fs_active_samples_to_register(nir_shader *s)
{
return nir_shader_intrinsics_pass(
s, lower_active_samples_to_register,
nir_metadata_dominance | nir_metadata_block_index, NULL);
}
static bool
agx_nir_lower_stats_fs(nir_shader *s)
{
assert(s->info.stage == MESA_SHADER_FRAGMENT);
nir_builder b_ =
nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
nir_builder *b = &b_;
nir_def *samples = nir_bit_count(b, nir_load_sample_mask_in(b));
unsigned query = PIPE_STAT_QUERY_PS_INVOCATIONS;
nir_def *addr = nir_load_stat_query_address_agx(b, .base = query);
nir_global_atomic(b, 32, addr, samples, .atomic_op = nir_atomic_op_iadd);
nir_metadata_preserve(b->impl,
nir_metadata_block_index | nir_metadata_dominance);
return true;
}
void
agx_nir_fs_prolog(nir_builder *b, const void *key_)
{
const struct agx_fs_prolog_key *key = key_;
b->shader->info.stage = MESA_SHADER_FRAGMENT;
b->shader->info.name = "FS prolog";
/* First, insert code for any emulated features */
if (key->api_sample_mask != 0xff) {
/* Kill samples that are NOT covered by the mask */
nir_discard_agx(b, nir_imm_intN_t(b, key->api_sample_mask ^ 0xff, 16));
b->shader->info.fs.uses_discard = true;
}
if (key->statistics) {
NIR_PASS(_, b->shader, agx_nir_lower_stats_fs);
}
if (key->cull_distance_size) {
NIR_PASS(_, b->shader, agx_nir_lower_cull_distance_fs,
key->cull_distance_size);
}
if (key->polygon_stipple) {
NIR_PASS_V(b->shader, agx_nir_lower_poly_stipple);
}
/* Then, lower the prolog */
NIR_PASS(_, b->shader, agx_nir_lower_discard_zs_emit);
NIR_PASS(_, b->shader, agx_nir_lower_sample_mask);
NIR_PASS(_, b->shader, nir_shader_intrinsics_pass,
lower_non_monolithic_uniforms,
nir_metadata_dominance | nir_metadata_block_index, NULL);
NIR_PASS(_, b->shader, lower_tests_zs, key->run_zs_tests);
b->shader->info.io_lowered = true;
}


@ -92,15 +92,10 @@ bool agx_nir_lower_tilebuffer(struct nir_shader *shader,
uint8_t *colormasks, unsigned *bindless_base,
bool *translucent);
struct agx_msaa_state {
uint8_t nr_samples;
/* Enable API sample mask lowering (e.g. glSampleMask) */
bool api_sample_mask;
};
bool agx_nir_lower_to_per_sample(struct nir_shader *shader);
bool agx_nir_lower_monolithic_msaa(struct nir_shader *shader,
struct agx_msaa_state *state);
uint8_t nr_samples);
bool agx_nir_lower_sample_intrinsics(struct nir_shader *shader);


@ -23,6 +23,7 @@ libasahi_lib_files = files(
'agx_nir_lower_tilebuffer.c',
'agx_nir_lower_uvs.c',
'agx_nir_lower_vbo.c',
'agx_nir_prolog_epilog.c',
'agx_ppp.h',
'agx_scratch.c',
'pool.c',


@ -143,6 +143,11 @@ agx_batch_init(struct agx_context *ctx,
batch->fs_preamble_scratch = 0;
batch->cs_preamble_scratch = 0;
/* May get read before write, need to initialize to 0 to avoid GPU-side UAF
* conditions.
*/
batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = 0;
/* We need to emit prim state at the start. Max collides with all. */
batch->reduced_prim = MESA_PRIM_COUNT;


@ -59,11 +59,17 @@ static void
write_shader(struct blob *blob, const struct agx_compiled_shader *binary,
bool is_root_gs)
{
uint32_t shader_size = binary->bo->size;
blob_write_uint32(blob, shader_size);
blob_write_bytes(blob, binary->bo->ptr.cpu, shader_size);
blob_write_bytes(blob, &binary->info, sizeof(binary->info));
blob_write_uint32(blob, binary->b.binary_size);
if (binary->b.binary_size) {
blob_write_bytes(blob, binary->b.binary, binary->b.binary_size);
}
blob_write_bytes(blob, &binary->b.info, sizeof(binary->b.info));
blob_write_bytes(blob, &binary->uvs, sizeof(binary->uvs));
blob_write_bytes(blob, &binary->attrib_components_read,
sizeof(binary->attrib_components_read));
blob_write_bytes(blob, &binary->epilog_key, sizeof(binary->epilog_key));
blob_write_uint32(blob, binary->push_range_count);
blob_write_bytes(blob, binary->push,
sizeof(binary->push[0]) * binary->push_range_count);
@ -91,13 +97,31 @@ read_shader(struct agx_screen *screen, struct blob_reader *blob,
binary->stage = uncompiled->type;
binary->so = uncompiled;
uint32_t binary_size = blob_read_uint32(blob);
binary->bo = agx_bo_create(&screen->dev, binary_size,
AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");
blob_copy_bytes(blob, binary->bo->ptr.cpu, binary_size);
size_t size = blob_read_uint32(blob);
blob_copy_bytes(blob, &binary->info, sizeof(binary->info));
if (uncompiled->type == PIPE_SHADER_VERTEX ||
uncompiled->type == PIPE_SHADER_TESS_EVAL ||
uncompiled->type == PIPE_SHADER_FRAGMENT) {
binary->b.binary_size = size;
binary->b.binary = malloc(binary->b.binary_size);
blob_copy_bytes(blob, binary->b.binary, binary->b.binary_size);
if (size) {
binary->bo = agx_bo_create(&screen->dev, size,
AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");
memcpy(binary->bo->ptr.cpu, binary->b.binary, size);
}
} else if (size) {
binary->bo = agx_bo_create(&screen->dev, size,
AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");
blob_copy_bytes(blob, binary->bo->ptr.cpu, size);
}
blob_copy_bytes(blob, &binary->b.info, sizeof(binary->b.info));
blob_copy_bytes(blob, &binary->uvs, sizeof(binary->uvs));
blob_copy_bytes(blob, &binary->attrib_components_read,
sizeof(binary->attrib_components_read));
blob_copy_bytes(blob, &binary->epilog_key, sizeof(binary->epilog_key));
binary->push_range_count = blob_read_uint32(blob);
blob_copy_bytes(blob, binary->push,
sizeof(binary->push[0]) * binary->push_range_count);
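/* Serialized layout produced by write_shader() and consumed here (sketch):
 *
 *   u32   binary_size
 *   bytes binary[binary_size]          (omitted when binary_size == 0)
 *   bytes agx_shader_info (b.info)
 *   bytes agx_unlinked_uvs_layout (uvs)
 *   bytes attrib_components_read bitset
 *   bytes epilog_key
 *   u32   push_range_count
 *   bytes push[push_range_count]
 */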
@ -133,8 +157,6 @@ agx_disk_cache_store(struct disk_cache *cache,
if (!cache)
return;
assert(binary->bo->ptr.cpu != NULL && "shaders must be CPU mapped");
cache_key cache_key;
agx_disk_cache_compute_key(cache, uncompiled, key, cache_key);


@ -4,6 +4,7 @@
*/
#include "compiler/nir/nir_builder.h"
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "agx_nir_lower_gs.h"
@ -41,6 +42,8 @@ struct table_state {
};
struct state {
gl_shader_stage stage, hw_stage;
/* Array of nir_intrinsic_instr's to fix up at the end */
struct util_dynarray loads;
@ -185,19 +188,13 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
return load_sysval_root(b, 1, 32, &u->fixed_point_size);
case nir_intrinsic_load_tex_sprite_mask_agx:
return load_sysval_root(b, 1, 16, &u->sprite_mask);
case nir_intrinsic_load_shader_part_tests_zs_agx:
return load_sysval_root(b, 1, 16, &u->no_epilog_discard);
case nir_intrinsic_load_clip_z_coeff_agx:
return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff));
case nir_intrinsic_load_uvs_index_agx:
return load_sysval_root(
b, 1, 16, &u->uvs_index[nir_intrinsic_io_semantics(intr).location]);
case nir_intrinsic_load_polygon_stipple_agx: {
nir_def *base = load_sysval_root(b, 1, 64, &u->polygon_stipple);
nir_def *row = intr->src[0].ssa;
nir_def *addr = nir_iadd(b, base, nir_u2u64(b, nir_imul_imm(b, row, 4)));
return nir_load_global_constant(b, addr, 4, 1, 32);
}
default:
break;
}
@ -369,6 +366,79 @@ lay_out_uniforms(struct agx_compiled_shader *shader, struct state *state)
{
unsigned uniform = 0;
if (state->stage == PIPE_SHADER_VERTEX ||
state->stage == PIPE_SHADER_TESS_EVAL) {
unsigned count =
DIV_ROUND_UP(BITSET_LAST_BIT(shader->attrib_components_read), 4);
struct agx_draw_uniforms *u = NULL;
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = 0,
.table = AGX_SYSVAL_TABLE_ROOT,
.offset = (uintptr_t)&u->attrib_base,
.length = 4 * count,
};
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = 4 * count,
.table = AGX_SYSVAL_TABLE_ROOT,
.offset = (uintptr_t)&u->attrib_clamp,
.length = 2 * count,
};
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = 6 * count,
.table = AGX_SYSVAL_TABLE_PARAMS,
.offset = 4,
.length = 2,
};
uniform = (6 * count) + 2;
if (state->hw_stage == PIPE_SHADER_COMPUTE) {
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = (6 * count) + 2,
.table = AGX_SYSVAL_TABLE_PARAMS,
.offset = 0,
.length = 2,
};
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = (6 * count) + 4,
.table = AGX_SYSVAL_TABLE_ROOT,
.offset = (uintptr_t)&u->input_assembly,
.length = 4,
};
uniform = (6 * count) + 8;
}
} else if (state->stage == PIPE_SHADER_FRAGMENT) {
struct agx_draw_uniforms *u = NULL;
struct agx_stage_uniforms *s = NULL;
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = 0,
.table = AGX_SYSVAL_TABLE_FS,
.offset = (uintptr_t)&s->texture_base,
.length = 4,
};
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = 4,
.table = AGX_SYSVAL_TABLE_ROOT,
.offset = (uintptr_t)&u->blend_constant,
.length = 8,
};
shader->push[shader->push_range_count++] = (struct agx_push_range){
.uniform = 12,
.table = AGX_SYSVAL_TABLE_ROOT,
.offset = (uintptr_t)&u->tables[AGX_SYSVAL_TABLE_ROOT],
.length = 4,
};
uniform = 16;
}
/* Lay out each system value table. We do this backwards to ensure the first
* uniform goes to the bindless texture base.
*/
@ -425,7 +495,11 @@ agx_nir_layout_uniforms(nir_shader *shader,
struct agx_compiled_shader *compiled,
unsigned *push_size)
{
struct state state = {0};
struct state state = {
.stage = compiled->stage,
.hw_stage = shader->info.stage,
};
nir_shader_intrinsics_pass(shader, record_loads,
nir_metadata_block_index | nir_metadata_dominance,
&state);


@ -556,7 +556,7 @@ agx_get_query_result_resource_gpu(struct agx_context *ctx,
&cb);
struct pipe_grid_info grid = {.block = {1, 1, 1}, .grid = {1, 1, 1}};
agx_launch(batch, &grid, cs, PIPE_SHADER_COMPUTE);
agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE);
/* take_ownership=true so do not unreference */
ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, true,

File diff suppressed because it is too large.


@ -11,6 +11,7 @@
#include "asahi/layout/layout.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/agx_linker.h"
#include "asahi/lib/agx_nir_lower_vbo.h"
#include "asahi/lib/agx_scratch.h"
#include "asahi/lib/agx_tilebuffer.h"
@ -162,6 +163,9 @@ struct PACKED agx_draw_uniforms {
/* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */
uint16_t clip_z_coeff;
/* ~0/0 boolean whether the epilog lacks any discard instruction */
uint16_t no_epilog_discard;
/* Mapping from varying slots written by the last vertex stage to UVS
* indices. This mapping must be compatible with the fragment shader.
*/
@ -212,15 +216,15 @@ struct agx_push_range {
};
struct agx_compiled_shader {
/* Base struct */
struct agx_shader_part b;
/* Uncompiled shader that we belong to */
const struct agx_uncompiled_shader *so;
/* Mapped executable memory */
struct agx_bo *bo;
/* Metadata returned from the compiler */
struct agx_shader_info info;
/* Uniforms the driver must push */
unsigned push_range_count;
struct agx_push_range push[AGX_MAX_PUSH_RANGES];
@ -228,6 +232,13 @@ struct agx_compiled_shader {
/* UVS layout for the last vertex stage */
struct agx_unlinked_uvs_layout uvs;
/* For a vertex shader, the mask of vertex attributes read. Used to key the
* prolog so the prolog doesn't write components not actually read.
*/
BITSET_DECLARE(attrib_components_read, VERT_ATTRIB_MAX * 4);
struct agx_fs_epilog_link_info epilog_key;
/* Auxiliary programs, or NULL if not used */
struct agx_compiled_shader *gs_count, *pre_gs;
struct agx_compiled_shader *gs_copy;
@ -245,6 +256,21 @@ struct agx_compiled_shader {
enum pipe_shader_type stage;
};
struct agx_fast_link_key {
union {
struct agx_vs_prolog_key vs;
struct agx_fs_prolog_key fs;
} prolog;
struct agx_compiled_shader *main;
union {
struct agx_fs_epilog_key fs;
} epilog;
unsigned nr_samples_shaded;
};
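/* Sketch (not from this MR) of how the per-uncompiled-shader linked_shaders
 * cache declared below might be consulted, assuming the table was created
 * with memhash/memcmp-style callbacks over the full key; a miss would fall
 * back to agx_fast_link() and insert the result.
 */
static struct agx_linked_shader *
lookup_linked(struct hash_table *linked_shaders,
              const struct agx_fast_link_key *key)
{
   uint32_t hash = _mesa_hash_data(key, sizeof(*key));
   struct hash_entry *ent =
      _mesa_hash_table_search_pre_hashed(linked_shaders, hash, key);

   return ent ? ent->data : NULL;
}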
struct agx_uncompiled_shader {
struct pipe_shader_state base;
enum pipe_shader_type type;
@ -257,6 +283,7 @@ struct agx_uncompiled_shader {
uint64_t inputs_linear_shaded;
uint8_t cull_distance_size;
bool has_edgeflags;
bool uses_fbfetch;
/* Number of bindful textures, images used */
unsigned nr_bindful_textures, nr_bindful_images;
@ -266,6 +293,9 @@ struct agx_uncompiled_shader {
struct agx_uncompiled_shader *passthrough_progs[MESA_PRIM_COUNT][3][2];
struct agx_uncompiled_shader *passthrough_tcs[32];
/* agx_fast_link_key -> agx_linked_shader */
struct hash_table *linked_shaders;
uint32_t xfb_strides[4];
bool has_xfb_info;
bool is_xfb_passthrough;
@ -450,14 +480,6 @@ struct agx_zsa {
uint32_t load, store;
};
struct agx_blend_key {
nir_lower_blend_rt rt[8];
unsigned logicop_func;
bool alpha_to_coverage, alpha_to_one;
bool padding[2];
};
static_assert(sizeof(struct agx_blend_key) == 232, "packed");
struct agx_blend {
struct agx_blend_key key;
@ -465,27 +487,11 @@ struct agx_blend {
uint32_t store;
};
/* These parts of the vertex element affect the generated code */
struct agx_velem_key {
uint32_t divisor;
uint16_t stride;
uint8_t format;
uint8_t pad;
};
struct asahi_vs_shader_key {
struct agx_velem_key attribs[AGX_MAX_VBUFS];
/* If true, this is running as a hardware vertex shader. If false, this is a
* compute job used to feed a TCS or GS.
*/
bool hw;
union {
struct {
uint8_t index_size_B;
} sw;
} next;
};
struct agx_vertex_elements {
@ -498,21 +504,11 @@ struct agx_vertex_elements {
};
struct asahi_fs_shader_key {
struct agx_blend_key blend;
/* Need to count FRAGMENT_SHADER_INVOCATIONS */
bool statistics;
/* Set if glSampleMask() is used with a mask other than all-1s. If not, we
* don't want to emit lowering code for it, since it would disable early-Z.
*/
bool api_sample_mask;
bool polygon_stipple;
uint8_t cull_distance_size;
uint8_t nr_samples;
enum pipe_format rt_formats[PIPE_MAX_COLOR_BUFS];
uint8_t nr_samples;
bool padding[7];
};
static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes");
struct asahi_gs_shader_key {
/* If true, this GS is run only for its side effects (including XFB) */
@ -598,6 +594,9 @@ struct agx_oq_heap;
struct agx_context {
struct pipe_context base;
struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
struct {
struct agx_linked_shader *vs, *tcs, *tes, *gs, *fs;
} linked;
uint32_t dirty;
/* Heap for dynamic memory allocation for geometry/tessellation shaders */
@ -759,8 +758,10 @@ agx_context(struct pipe_context *pctx)
return (struct agx_context *)pctx;
}
struct agx_linked_shader;
void agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
struct agx_compiled_shader *cs, enum pipe_shader_type stage);
struct agx_compiled_shader *cs,
struct agx_linked_shader *linked, enum pipe_shader_type stage);
void agx_init_query_functions(struct pipe_context *ctx);