From 1f7fe678e78f7137560215a5b9dfd6c38b009836 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Fri, 27 Jun 2025 13:37:40 -0400
Subject: [PATCH] asahi,hk: significantly rework GS

Get rid of the rasterizer discard variants by pushing XFB into the hardware VS
and letting everything cascade down from there. That means the hardware VS runs
for all streams, which in turn gives us dynamic rasterization stream selection.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35802>
---
 src/asahi/lib/agx_nir_lower_gs.c              | 847 +++++++++---------
 src/asahi/lib/agx_nir_lower_gs.h              |   9 +-
 src/asahi/libagx/geometry.cl                  |  76 +-
 src/asahi/libagx/geometry.h                   |  40 +-
 src/asahi/vulkan/hk_cmd_buffer.h              |   3 +
 src/asahi/vulkan/hk_cmd_draw.c                |  56 +-
 src/asahi/vulkan/hk_nir_lower_descriptors.c   |   3 +
 src/asahi/vulkan/hk_shader.c                  |  83 +-
 src/asahi/vulkan/hk_shader.h                  |  18 +-
 src/gallium/drivers/asahi/agx_disk_cache.c    |   7 +-
 .../drivers/asahi/agx_nir_lower_sysvals.c     |   2 +
 src/gallium/drivers/asahi/agx_state.c         |  41 +-
 src/gallium/drivers/asahi/agx_state.h         |   8 -
 13 files changed, 575 insertions(+), 618 deletions(-)

diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c
index 0202cea00a9..a3ee09f595b 100644
--- a/src/asahi/lib/agx_nir_lower_gs.c
+++ b/src/asahi/lib/agx_nir_lower_gs.c
@@ -1,6 +1,7 @@
 /*
  * Copyright 2023 Alyssa Rosenzweig
  * Copyright 2023 Valve Corporation
+ * Copyright 2015 Intel Corporation
  * SPDX-License-Identifier: MIT
  */
 
@@ -22,11 +23,143 @@
 #include "nir_xfb_info.h"
 #include "shader_enums.h"
 
-#define MAX_PRIM_OUT_SIZE 3
+struct state {
+   nir_variable *vertices[NIR_MAX_XFB_STREAMS];
+   nir_variable *first_vertex[NIR_MAX_XFB_STREAMS];
+   nir_variable *xfb_count[NIR_MAX_XFB_STREAMS];
+   nir_variable *indices;
+};
+
+static void
+emit_primitive(nir_builder *b, struct state *state, unsigned stream)
+{
+   unsigned min_verts = nir_verts_in_output_prim(b->shader);
+   bool restart = min_verts > 1;
+
+   nir_def *indices = nir_load_var(b, state->indices);
+   nir_def *first_vertex = nir_load_var(b, state->first_vertex[stream]);
+   nir_def *total_vertices = nir_load_var(b, state->vertices[stream]);
+   nir_def *xfb_count = nir_load_var(b, state->xfb_count[stream]);
+   nir_def *length = nir_isub(b, total_vertices, first_vertex);
+
+   nir_emit_primitive_poly(b, indices, first_vertex, length, xfb_count, stream);
+
+   /* Allocate index buffer space */
+   nir_def *degenerate = nir_ult_imm(b, length, min_verts);
+   nir_def *added_indices = nir_iadd_imm(b, length, restart);
+   added_indices = nir_bcsel(b, degenerate, nir_imm_int(b, 0), added_indices);
+   nir_store_var(b, state->indices, nir_iadd(b, indices, added_indices), 0x1);
+
+   /* We form a new primitive for every vertex emitted after the first
+    * complete primitive (since we're outputting strips).
+    */
+   nir_def *xfb_prims = nir_iadd_imm(b, length, -(min_verts - 1));
+   xfb_prims = nir_bcsel(b, degenerate, nir_imm_int(b, 0), xfb_prims);
+   nir_store_var(b, state->xfb_count[stream], nir_iadd(b, xfb_count, xfb_prims),
+                 0x1);
+
+   nir_store_var(b, state->first_vertex[stream], total_vertices, 0x1);
+}
+
+static bool
+rewrite_intrinsics(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+   struct state *state = state_;
+
+   if (intr->intrinsic == nir_intrinsic_emit_vertex) {
+      unsigned stream = nir_intrinsic_stream_id(intr);
+
+      nir_def *count = nir_load_var(b, state->vertices[stream]);
+      nir_select_vertex_poly(b, count, stream);
+      nir_store_var(b, state->vertices[stream], nir_iadd_imm(b, count, 1), 0x1);
+   } else if (intr->intrinsic == nir_intrinsic_end_primitive) {
+      /* Emit is deferred for points */
+      if (b->shader->info.gs.output_primitive != MESA_PRIM_POINTS)
+         emit_primitive(b, state, nir_intrinsic_stream_id(intr));
+   } else {
+      return false;
+   }
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+agx_nir_lower_gs_intrinsics(nir_shader *shader)
+{
+   struct state state;
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+   nir_builder b = nir_builder_at(nir_before_impl(impl));
+   nir_def *zero = nir_imm_int(&b, 0);
+   const glsl_type *T = glsl_uint_type();
+
+   for (unsigned i = 0; i < NIR_MAX_XFB_STREAMS; ++i) {
+      state.vertices[i] = nir_local_variable_create(impl, T, NULL);
+      state.first_vertex[i] = nir_local_variable_create(impl, T, NULL);
+      state.xfb_count[i] = nir_local_variable_create(impl, T, NULL);
+
+      nir_store_var(&b, state.vertices[i], zero, 0x1);
+      nir_store_var(&b, state.first_vertex[i], zero, 0x1);
+      nir_store_var(&b, state.xfb_count[i], zero, 0x1);
+   }
+
+   state.indices = nir_local_variable_create(impl, T, NULL);
+   nir_store_var(&b, state.indices, zero, 0x1);
+
+   /* Make sure all the primitives are ended at the end of the shader. */
+   b.cursor = nir_after_impl(impl);
+
+   u_foreach_bit(stream, shader->info.gs.active_stream_mask) {
+      nir_end_primitive(&b, stream);
+   }
+
+   nir_shader_intrinsics_pass(shader, rewrite_intrinsics,
+                              nir_metadata_control_flow, &state);
+
+   b.cursor = nir_after_impl(impl);
+
+   if (shader->info.gs.output_primitive == MESA_PRIM_POINTS) {
+      u_foreach_bit(stream, shader->info.gs.active_stream_mask) {
+         emit_primitive(&b, &state, stream);
+      }
+   }
+
+   /* If we have side effects, make sure we run the geometry shader at least
+    * once by outputting a dummy primitive if we wouldn't output anything.
+    */
+   if (shader->info.writes_memory) {
+      unsigned n = nir_verts_in_output_prim(shader);
+      shader->info.gs.vertices_out = MAX2(shader->info.gs.vertices_out, n);
+
+      nir_push_if(&b, nir_ieq_imm(&b, nir_load_var(&b, state.indices), 0));
+      {
+         nir_def *zero = nir_imm_int(&b, 0);
+         nir_def *n_ = nir_imm_int(&b, n);
+         bool restart = n > 1;
+
+         shader->info.outputs_written |= VARYING_BIT_POS;
+         nir_store_output(&b, nir_imm_float(&b, NAN), zero,
+                          .io_semantics.location = VARYING_SLOT_POS);
+         nir_select_vertex_poly(&b, zero);
+         nir_emit_primitive_poly(&b, zero, zero, n_, zero);
+         nir_store_var(&b, state.indices, nir_iadd_imm(&b, n_, restart), 1);
+      }
+      nir_pop_if(&b, NULL);
+   }
+
+   /* Report the counts */
+   for (unsigned stream = 0; stream < NIR_MAX_XFB_STREAMS; ++stream) {
+      nir_set_vertex_and_primitive_count(
+         &b, nir_imm_int(&b, 0), nir_load_var(&b, state.indices),
+         nir_load_var(&b, state.xfb_count[stream]), stream);
+   }
+
+   return nir_progress(true, impl, nir_metadata_none);
+}
 
 struct lower_gs_state {
    int static_count[MAX_VERTEX_STREAMS];
-   nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS][MAX_PRIM_OUT_SIZE];
 
    /* The index of each counter in the count buffer, or -1 if it's not in the
     * count buffer.
@@ -35,8 +168,6 @@ struct lower_gs_state {
     */
    int count_index[MAX_VERTEX_STREAMS];
 
-   bool rasterizer_discard;
-
    struct agx_gs_info *info;
 };
 
@@ -93,20 +224,6 @@ lower_store_to_var(nir_builder *b, nir_intrinsic_instr *intr,
    nir_store_var(b, var, value, BITFIELD_BIT(component));
 }
 
-static bool
-lower_output_to_var(nir_builder *b, nir_instr *instr, void *data)
-{
-   if (instr->type != nir_instr_type_intrinsic)
-      return false;
-
-   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-   if (intr->intrinsic != nir_intrinsic_store_output)
-      return false;
-
-   lower_store_to_var(b, intr, data);
-   return true;
-}
-
 /*
  * Geometry shader invocations are compute-like:
  *
@@ -278,9 +395,9 @@ static bool
 lower_gs_count_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
    switch (intr->intrinsic) {
-   case nir_intrinsic_emit_vertex_with_counter:
-   case nir_intrinsic_end_primitive_with_counter:
    case nir_intrinsic_store_output:
+   case nir_intrinsic_select_vertex_poly:
+   case nir_intrinsic_emit_primitive_poly:
       /* These are for the main shader, just remove them */
       nir_instr_remove(&intr->instr);
       return true;
@@ -349,9 +466,12 @@ agx_nir_create_geometry_count_shader(nir_shader *gs,
 
 struct lower_gs_rast_state {
    nir_def *raw_instance_id;
-   nir_def *instance_id, *primitive_id, *output_id;
+   nir_def *instance_id, *primitive_id, *output_id, *stream;
    struct lower_output_to_var_state outputs;
    struct lower_output_to_var_state selected;
+   bool points;
+
+   nir_variable *output_strip_length, *output_strip_base, *id_in_strip;
 };
 
 static void
@@ -359,19 +479,15 @@ select_rast_output(nir_builder *b, nir_intrinsic_instr *intr,
                    struct lower_gs_rast_state *state)
 {
    b->cursor = nir_instr_remove(&intr->instr);
-
-   /* We only care about the rasterization stream in the rasterization
-    * shader, so just ignore emits from other streams.
-    */
-   if (nir_intrinsic_stream_id(intr) != 0)
-      return;
+   nir_def *us = nir_ieq(b, intr->src[0].ssa, state->output_id);
+   us = nir_iand(b, us,
+                 nir_ieq_imm(b, state->stream, nir_intrinsic_stream_id(intr)));
 
    u_foreach_bit64(slot, b->shader->info.outputs_written) {
       nir_def *orig = nir_load_var(b, state->selected.outputs[slot]);
       nir_def *data = nir_load_var(b, state->outputs.outputs[slot]);
 
-      nir_def *value = nir_bcsel(
-         b, nir_ieq(b, intr->src[0].ssa, state->output_id), data, orig);
+      nir_def *value = nir_bcsel(b, us, data, orig);
 
       nir_store_var(b, state->selected.outputs[slot], value,
                     nir_component_mask(value->num_components));
@@ -388,7 +504,7 @@ lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data)
       lower_store_to_var(b, intr, &state->outputs);
       return true;
 
-   case nir_intrinsic_emit_vertex_with_counter:
+   case nir_intrinsic_select_vertex_poly:
       select_rast_output(b, intr, state);
       return true;
 
@@ -411,7 +527,37 @@ lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data)
       return lower_id(b, intr, NULL);
    }
 
-   case nir_intrinsic_end_primitive_with_counter:
+   case nir_intrinsic_emit_primitive_poly: {
+      b->cursor = nir_before_instr(&intr->instr);
+      nir_def *id = state->output_id;
+
+      nir_def *first_id = intr->src[1].ssa;
+      nir_def *length = intr->src[2].ssa;
+      nir_def *base = intr->src[3].ssa;
+      nir_def *id_in_strip = nir_isub(b, id, first_id);
+
+      nir_def *us = nir_ult(b, id, nir_iadd(b, first_id, length));
+      us = nir_iand(b, us, nir_uge(b, id, first_id));
+      us = nir_iand(
+         b, us, nir_ieq_imm(b, state->stream, nir_intrinsic_stream_id(intr)));
+
+      nir_def *orig = nir_load_var(b, state->output_strip_length);
+      nir_def *value = nir_bcsel(b, us, length, orig);
+      nir_store_var(b, state->output_strip_length, value,
+                    nir_component_mask(1));
+
+      orig = nir_load_var(b, state->output_strip_base);
+      value = nir_bcsel(b, us, base, orig);
+      nir_store_var(b, state->output_strip_base, value, nir_component_mask(1));
+
+      orig = nir_load_var(b, state->id_in_strip);
+      value = nir_bcsel(b, us, id_in_strip, orig);
+      nir_store_var(b, state->id_in_strip, value, nir_component_mask(1));
+
+      nir_instr_remove(&intr->instr);
+      return true;
+   }
+
    case nir_intrinsic_set_vertex_and_primitive_count:
       nir_instr_remove(&intr->instr);
       return true;
@@ -421,101 +567,6 @@ lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    }
 }
 
-/*
- * Side effects in geometry shaders are problematic with our "GS rasterization
- * shader" implementation. Where does the side effect happen? In the prepass?
- * In the rast shader? In both?
- *
- * A perfect solution is impossible with rast shaders. Since the spec is loose
- * here, we follow the principle of "least surprise":
- *
- * 1. Prefer side effects in the prepass over the rast shader. The prepass runs
- *    once per API GS invocation so will match the expectations of buggy apps
- *    not written for tilers.
- *
- * 2. If we must execute any side effect in the rast shader, try to execute all
- *    side effects only in the rast shader. If some side effects must happen in
- *    the rast shader and others don't, this gets consistent counts
- *    (i.e. if the app expects plain stores and atomics to match up).
- *
- * 3. If we must execute side effects in both rast and the prepass,
- *    execute all side effects in the rast shader and strip what we can from
- *    the prepass. This gets the "unsurprising" behaviour from #2 without
- *    falling over for ridiculous uses of atomics.
- */
-static bool
-strip_side_effect_from_rast(nir_builder *b, nir_intrinsic_instr *intr,
-                            void *data)
-{
-   switch (intr->intrinsic) {
-   case nir_intrinsic_store_global:
-   case nir_intrinsic_global_atomic:
-   case nir_intrinsic_global_atomic_swap:
-      break;
-   default:
-      return false;
-   }
-
-   /* If there's a side effect that's actually required, keep it. */
-   if (nir_intrinsic_infos[intr->intrinsic].has_dest &&
-       !list_is_empty(&intr->def.uses)) {
-
-      bool *any = data;
-      *any = true;
-      return false;
-   }
-
-   /* Otherwise, remove the dead instruction. */
-   nir_instr_remove(&intr->instr);
-   return true;
-}
-
-static bool
-strip_side_effects_from_rast(nir_shader *s, bool *side_effects_for_rast)
-{
-   bool progress, any;
-
-   /* Rather than complex analysis, clone and try to remove as many side effects
-    * as possible. Then we check if we removed them all. We need to loop to
-    * handle complex control flow with side effects, where we can strip
-    * everything but can't figure that out with a simple one-shot analysis.
-    */
-   nir_shader *clone = nir_shader_clone(NULL, s);
-
-   /* Drop as much as we can */
-   do {
-      progress = false;
-      any = false;
-      NIR_PASS(progress, clone, nir_shader_intrinsics_pass,
-               strip_side_effect_from_rast, nir_metadata_control_flow, &any);
-
-      NIR_PASS(progress, clone, nir_opt_dce);
-      NIR_PASS(progress, clone, nir_opt_dead_cf);
-   } while (progress);
-
-   ralloc_free(clone);
-
-   /* If we need atomics, leave them in */
-   if (any) {
-      *side_effects_for_rast = true;
-      return false;
-   }
-
-   /* Else strip it all */
-   do {
-      progress = false;
-      any = false;
-      NIR_PASS(progress, s, nir_shader_intrinsics_pass,
-               strip_side_effect_from_rast, nir_metadata_control_flow, &any);
-
-      NIR_PASS(progress, s, nir_opt_dce);
-      NIR_PASS(progress, s, nir_opt_dead_cf);
-   } while (progress);
-
-   assert(!any);
-   return progress;
-}
-
 static bool
 strip_side_effect_from_main(nir_builder *b, nir_intrinsic_instr *intr,
                             void *data)
@@ -523,17 +574,42 @@ strip_side_effect_from_main(nir_builder *b, nir_intrinsic_instr *intr,
    switch (intr->intrinsic) {
    case nir_intrinsic_global_atomic:
    case nir_intrinsic_global_atomic_swap:
-      break;
+   case nir_intrinsic_image_atomic:
+   case nir_intrinsic_image_atomic_swap:
+   case nir_intrinsic_bindless_image_atomic:
+   case nir_intrinsic_bindless_image_atomic_swap:
+      if (list_is_empty(&intr->def.uses)) {
+         nir_instr_remove(&intr->instr);
+         return true;
+      }
+
+      return false;
+
+   case nir_intrinsic_store_global:
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_bindless_image_store:
+   case nir_intrinsic_fence_pbe_to_tex_agx:
+      if (data) {
+         nir_instr_remove(&intr->instr);
+         return true;
+      }
+
+      return false;
+
    default:
       return false;
    }
+}
 
-   if (list_is_empty(&intr->def.uses)) {
-      nir_instr_remove(&intr->instr);
-      return true;
-   }
-
-   return false;
+/*
+ * The stream # is encoded into the lower bits of an index. The stream
+ * multiplier is what vertex IDs are multiplied by before adding the stream #.
+ */
+static unsigned
+stream_multiplier(const nir_shader *gs)
+{
+   unsigned nr_streams = util_last_bit(gs->info.gs.active_stream_mask);
+   return util_next_power_of_two(nr_streams);
 }
 
 /*
@@ -541,7 +617,7 @@ strip_side_effect_from_main(nir_builder *b, nir_intrinsic_instr *intr,
  * shades each rasterized output vertex in parallel.
  */
 static nir_shader *
-agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast,
+agx_nir_create_gs_rast_shader(const nir_shader *gs,
                               const struct lower_gs_state *state)
 {
    /* Don't muck up the original shader */
@@ -561,18 +637,27 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast,
       shader->info.name = "gs rast";
    }
 
-   nir_builder b_ =
-      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(shader)));
-   nir_builder *b = &b_;
-
-   NIR_PASS(_, shader, strip_side_effects_from_rast, side_effects_for_rast);
-
    /* Optimize out pointless gl_PointSize outputs. Bizarrely, these occur. */
    if (shader->info.gs.output_primitive != MESA_PRIM_POINTS)
       shader->info.outputs_written &= ~VARYING_BIT_PSIZ;
 
+   nir_builder b_ =
+      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(shader)));
+   nir_builder *b = &b_;
+
+   const glsl_type *T = glsl_uint_type();
    nir_def *raw_vertex_id = nir_load_vertex_id(b);
-   struct lower_gs_rast_state rs = {.raw_instance_id = nir_load_instance_id(b)};
+
+   struct lower_gs_rast_state rs = {
+      .raw_instance_id = nir_load_instance_id(b),
+      .points = gs->info.gs.output_primitive == MESA_PRIM_POINTS,
+      .stream = nir_umod_imm(b, raw_vertex_id, stream_multiplier(gs)),
+      .output_strip_length = nir_local_variable_create(b->impl, T, NULL),
+      .output_strip_base = nir_local_variable_create(b->impl, T, NULL),
+      .id_in_strip = nir_local_variable_create(b->impl, T, NULL),
+   };
+
+   raw_vertex_id = nir_udiv_imm(b, raw_vertex_id, stream_multiplier(gs));
 
    switch (state->info->shape) {
    case AGX_GS_SHAPE_DYNAMIC_INDEXED: {
@@ -633,6 +718,83 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast,
                               nir_metadata_control_flow, &rs);
 
    b->cursor = nir_after_impl(b->impl);
+   if (gs->xfb_info) {
+      unsigned n_ = mesa_vertices_per_prim(gs->info.gs.output_primitive);
+      nir_def *zero = nir_imm_int(b, 0);
+      nir_def *strip_length =
+         rs.points ? zero : nir_load_var(b, rs.output_strip_length);
+      nir_def *id_in_strip = rs.points ? zero : nir_load_var(b, rs.id_in_strip);
+      nir_def *base =
+         rs.points ? rs.output_id : nir_load_var(b, rs.output_strip_base);
+
+      struct nir_xfb_info *xfb = gs->xfb_info;
+
+      nir_def *unrolled = nir_iadd(
+         b, nir_imul(b, rs.instance_id, load_geometry_param(b, gs_grid[0])),
+         rs.primitive_id);
+
+      nir_def *n = nir_imm_int(b, n_);
+
+      for (unsigned p_ = 0; p_ < n_; ++p_) {
+         nir_def *p = nir_imm_int(b, p_);
+         nir_push_if(b, libagx_xfb_vertex_copy_in_strip(b, n, id_in_strip,
+                                                        strip_length, p));
+
+         /* Write XFB for each output */
+         for (unsigned i = 0; i < xfb->output_count; ++i) {
+            nir_xfb_output_info output = xfb->outputs[i];
+            unsigned stream = xfb->buffer_to_stream[output.buffer];
+            nir_push_if(b, nir_ieq_imm(b, rs.stream, stream));
+
+            /* Get the index of this primitive in the XFB buffer. That is, the
+             * base for this invocation for the stream plus the offset within
+             * this invocation.
+             */
+            nir_def *invocation_base = libagx_previous_xfb_primitives(
+               b, nir_load_geometry_param_buffer_agx(b),
+               nir_imm_int(b, state->static_count[stream]),
+               nir_imm_int(b, state->count_index[stream]),
+               nir_imm_int(b, state->info->count_words),
+               nir_imm_bool(b, state->info->prefix_sum), unrolled);
+
+            nir_def *index = libagx_xfb_vertex_offset(
+               b, n, invocation_base, base, id_in_strip, p,
+               nir_inot(b, nir_i2b(b, nir_load_provoking_last(b))));
+
+            nir_def *xfb_verts = load_geometry_param(b, xfb_verts[stream]);
+            nir_push_if(b, nir_ult(b, index, xfb_verts));
+            {
+               unsigned buffer = output.buffer;
+               unsigned stride = xfb->buffers[buffer].stride;
+               unsigned count = util_bitcount(output.component_mask);
+
+               nir_variable *var = rs.selected.outputs[output.location];
+               nir_def *value =
+                  var ? nir_load_var(b, var) : nir_undef(b, 4, 32);
+
+               /* In case output.component_mask contains invalid components,
+                * write out zeroes instead of blowing up validation.
+                *
+                * KHR-Single-GL44.enhanced_layouts.xfb_capture_inactive_output_component
+                * hits this.
+                */
+               value = nir_pad_vector_imm_int(b, value, 0, 4);
+
+               nir_def *addr = libagx_xfb_vertex_address(
+                  b, nir_load_geometry_param_buffer_agx(b), index,
+                  nir_imm_int(b, buffer), nir_imm_int(b, stride),
+                  nir_imm_int(b, output.offset));
+
+               nir_store_global(b, addr, 4,
+                                nir_channels(b, value, output.component_mask),
+                                nir_component_mask(count));
+            }
+            nir_pop_if(b, NULL);
+            nir_pop_if(b, NULL);
+         }
+         nir_pop_if(b, NULL);
+      }
+   }
 
    /* Forward each selected output to the rasterizer */
    u_foreach_bit64(slot, shader->info.outputs_written) {
@@ -647,15 +809,29 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast,
       if (slot == VARYING_SLOT_CLIP_DIST1)
          offset = 1;
 
+      /* We must only rasterize vertices from the rasterization stream. Since we
+       * shade vertices across all streams, we do this by throwing away vertices
+       * from non-rasterization streams (by setting a component to NaN).
+       */
+      if (slot == VARYING_SLOT_POS && state->info->multistream) {
+         nir_def *rast_stream = nir_load_rasterization_stream(b);
+         nir_def *nan = nir_imm_float(b, NAN);
+         nir_def *killed = nir_vector_insert_imm(b, value, nan, 3);
+
+         value =
+            nir_bcsel(b, nir_ieq(b, rs.stream, rast_stream), value, killed);
+      }
+
       nir_store_output(b, value, nir_imm_int(b, offset),
-                       .io_semantics.location = slot - offset,
-                       .io_semantics.num_slots = 1,
-                       .write_mask = nir_component_mask(value->num_components),
-                       .src_type = nir_type_uint32);
+                       .io_semantics.location = slot - offset);
    }
 
-   /* The geometry shader might not write point size - ensure it does. */
-   if (gs->info.gs.output_primitive == MESA_PRIM_POINTS) {
+   /* The geometry shader might not write point size - ensure it does, if we're
+    * rasterizing at all.
+    */
+   if (gs->info.gs.output_primitive == MESA_PRIM_POINTS &&
+       (shader->info.outputs_written & VARYING_BIT_POS)) {
+
       nir_lower_default_point_size(shader);
    }
 
@@ -663,206 +839,46 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast,
    return shader;
 }
 
-static void
-lower_end_primitive(nir_builder *b, nir_intrinsic_instr *intr,
-                    struct lower_gs_state *state)
-{
-   assert((intr->intrinsic == nir_intrinsic_set_vertex_and_primitive_count ||
-           b->shader->info.gs.output_primitive != MESA_PRIM_POINTS) &&
-          "endprimitive for points should've been removed");
-
-   /* The GS is the last stage before rasterization, so if we discard the
-    * rasterization, we don't output an index buffer, nothing will read it.
-    * Index buffer is only for the rasterization stream.
-    */
-   unsigned stream = nir_intrinsic_stream_id(intr);
-   if (state->rasterizer_discard || stream != 0)
-      return;
-
-   libagx_end_primitive(
-      b, load_geometry_param(b, output_index_buffer), intr->src[0].ssa,
-      intr->src[1].ssa, intr->src[2].ssa,
-      nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
-      calc_unrolled_index_id(b),
-      nir_imm_bool(b, b->shader->info.gs.output_primitive != MESA_PRIM_POINTS));
-}
-
-static void
-write_xfb(nir_builder *b, struct lower_gs_state *state, unsigned stream,
-          nir_def *index_in_strip, nir_def *prim_id_in_invocation)
-{
-   struct nir_xfb_info *xfb = b->shader->xfb_info;
-   unsigned verts = nir_verts_in_output_prim(b->shader);
-
-   /* Get the index of this primitive in the XFB buffer. That is, the base for
-    * this invocation for the stream plus the offset within this invocation.
-    */
-   nir_def *invocation_base = libagx_previous_xfb_primitives(
-      b, nir_load_geometry_param_buffer_agx(b),
-      nir_imm_int(b, state->static_count[stream]),
-      nir_imm_int(b, state->count_index[stream]),
-      nir_imm_int(b, state->info->count_words),
-      nir_imm_bool(b, state->info->prefix_sum), calc_unrolled_id(b));
-
-   nir_def *prim_index = nir_iadd(b, invocation_base, prim_id_in_invocation);
-   nir_def *base_index = nir_imul_imm(b, prim_index, verts);
-
-   nir_def *xfb_prims = load_geometry_param(b, xfb_prims[stream]);
-   nir_push_if(b, nir_ult(b, prim_index, xfb_prims));
-
-   /* Write XFB for each output */
-   for (unsigned i = 0; i < xfb->output_count; ++i) {
-      nir_xfb_output_info output = xfb->outputs[i];
-
-      /* Only write to the selected stream */
-      if (xfb->buffer_to_stream[output.buffer] != stream)
-         continue;
-
-      unsigned buffer = output.buffer;
-      unsigned stride = xfb->buffers[buffer].stride;
-      unsigned count = util_bitcount(output.component_mask);
-
-      for (unsigned vert = 0; vert < verts; ++vert) {
-         /* We write out the vertices backwards, since 0 is the current
-          * emitted vertex (which is actually the last vertex).
-          *
-          * We handle NULL var for
-          * KHR-Single-GL44.enhanced_layouts.xfb_capture_struct.
-          */
-         unsigned v = (verts - 1) - vert;
-         nir_variable *var = state->outputs[output.location][v];
-         nir_def *value = var ? nir_load_var(b, var) : nir_undef(b, 4, 32);
-
-         /* In case output.component_mask contains invalid components, write
-          * out zeroes instead of blowing up validation.
-          *
-          * KHR-Single-GL44.enhanced_layouts.xfb_capture_inactive_output_component
-          * hits this.
-          */
-         value = nir_pad_vector_imm_int(b, value, 0, 4);
-
-         nir_def *rotated_vert = nir_imm_int(b, vert);
-         if (verts == 3) {
-            /* Map vertices for output so we get consistent winding order. For
-             * the primitive index, we use the index_in_strip. This is actually
-             * the vertex index in the strip, hence
-             * offset by 2 relative to the true primitive index (#2 for the
-             * first triangle in the strip, #3 for the second). That's ok
-             * because only the parity matters.
-             */
-            rotated_vert = libagx_map_vertex_in_tri_strip(
-               b, index_in_strip, rotated_vert,
-               nir_inot(b, nir_i2b(b, nir_load_provoking_last(b))));
-         }
-
-         nir_def *addr = libagx_xfb_vertex_address(
-            b, nir_load_geometry_param_buffer_agx(b), base_index, rotated_vert,
-            nir_imm_int(b, buffer), nir_imm_int(b, stride),
-            nir_imm_int(b, output.offset));
-
-         nir_store_global(b, addr, 4,
-                          nir_channels(b, value, output.component_mask),
-                          nir_component_mask(count));
-      }
-   }
-
-   nir_pop_if(b, NULL);
-}
-
-/* Handle transform feedback for a given emit_vertex_with_counter */
-static void
-lower_emit_vertex_xfb(nir_builder *b, nir_intrinsic_instr *intr,
-                      struct lower_gs_state *state)
-{
-   /* Transform feedback is written for each decomposed output primitive. Since
-    * we're writing strips, that means we output XFB for each vertex after the
-    * first complete primitive is formed.
-    */
-   unsigned first_prim = nir_verts_in_output_prim(b->shader) - 1;
-   nir_def *index_in_strip = intr->src[1].ssa;
-
-   nir_push_if(b, nir_uge_imm(b, index_in_strip, first_prim));
-   {
-      write_xfb(b, state, nir_intrinsic_stream_id(intr), index_in_strip,
-                intr->src[3].ssa);
-   }
-   nir_pop_if(b, NULL);
-
-   /* Transform feedback writes out entire primitives during the emit_vertex. To
-    * do that, we store the values at all vertices in the strip in a little ring
-    * buffer. Index #0 is always the most recent primitive (so non-XFB code can
-    * just grab index #0 without any checking). Index #1 is the previous vertex,
-    * and index #2 is the vertex before that. Now that we've written XFB, since
-    * we've emitted a vertex we need to cycle the ringbuffer, freeing up index
-    * #0 for the next vertex that we are about to emit. We do that by copying
-    * the first n - 1 vertices forward one slot, which has to happen with a
-    * backwards copy implemented here.
-    *
-    * If we're lucky, all of these copies will be propagated away. If we're
-    * unlucky, this involves at most 2 copies per component per XFB output per
-    * vertex.
-    */
-   u_foreach_bit64(slot, b->shader->info.outputs_written) {
-      /* Note: if we're outputting points, nir_verts_in_output_prim will be 1,
-       * so this loop will not execute. This is intended: points are
-       * self-contained primitives and do not need these copies.
-       */
-      for (int v = nir_verts_in_output_prim(b->shader) - 1; v >= 1; --v) {
-         nir_def *value = nir_load_var(b, state->outputs[slot][v - 1]);
-
-         nir_store_var(b, state->outputs[slot][v], value,
-                       nir_component_mask(value->num_components));
-      }
-   }
-}
-
 static bool
-lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state)
+lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
 {
    b->cursor = nir_before_instr(&intr->instr);
-   struct lower_gs_state *state_ = state;
+   struct lower_gs_state *state = state_;
 
    switch (intr->intrinsic) {
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      if (state_->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
+      if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
          break;
 
-      /* Points write their index buffer here, other primitives write on end. We
-       * also pad the index buffer here for the rasterization stream.
-       */
-      if (b->shader->info.gs.output_primitive == MESA_PRIM_POINTS) {
-         lower_end_primitive(b, intr, state);
-      }
-
-      if (nir_intrinsic_stream_id(intr) == 0 && !state_->rasterizer_discard) {
-         libagx_pad_index_gs(b, load_geometry_param(b, output_index_buffer),
-                             intr->src[0].ssa, intr->src[1].ssa,
-                             calc_unrolled_id(b),
-                             nir_imm_int(b, state_->info->max_indices));
+      /* All streams are merged, just pick a single instruction */
+      if (nir_intrinsic_stream_id(intr) == 0) {
+         libagx_pad_index_gs(
+            b, load_geometry_param(b, output_index_buffer),
+            nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
+            intr->src[1].ssa, nir_imm_int(b, state->info->max_indices));
       }
 
       break;
    }
 
-   case nir_intrinsic_end_primitive_with_counter: {
-      if (state_->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
+   case nir_intrinsic_emit_primitive_poly: {
+      if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
          break;
 
-      unsigned min = nir_verts_in_output_prim(b->shader);
-
-      /* We only write out complete primitives */
-      nir_push_if(b, nir_uge_imm(b, intr->src[1].ssa, min));
-      {
-         lower_end_primitive(b, intr, state);
-      }
-      nir_pop_if(b, NULL);
+      libagx_write_strip(
+         b, load_geometry_param(b, output_index_buffer),
+         nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
+         intr->src[0].ssa,
+         nir_iadd(b, calc_unrolled_index_id(b), intr->src[1].ssa),
+         intr->src[2].ssa,
+         nir_imm_ivec3(b, nir_intrinsic_stream_id(intr),
+                       stream_multiplier(b->shader),
+                       nir_verts_in_output_prim(b->shader)));
       break;
    }
 
-   case nir_intrinsic_emit_vertex_with_counter:
-      /* emit_vertex triggers transform feedback but is otherwise a no-op. */
-      if (b->shader->xfb_info)
-         lower_emit_vertex_xfb(b, intr, state);
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_select_vertex_poly:
       break;
 
    default:
@@ -1012,24 +1028,14 @@ agx_nir_lower_gs_instancing(nir_shader *gs)
 }
 
 static unsigned
-calculate_max_indices(enum mesa_prim prim, unsigned verts, signed static_verts,
-                      signed static_prims)
+calculate_max_indices(enum mesa_prim prim, unsigned verts)
 {
-   /* We always have a static max_vertices, but we might have a tighter bound.
-    * Use it if we have one
-    */
-   if (static_verts >= 0) {
-      verts = MIN2(verts, static_verts);
-   }
-
    /* Points do not need primitive count added. Other topologies do. If we have
     * a static primitive count, we use that. Otherwise, we use a worst case
     * estimate that primitives are emitted one-by-one.
     */
    if (prim == MESA_PRIM_POINTS)
       return verts;
-   else if (static_prims >= 0)
-      return verts + static_prims;
    else
       return verts + (verts / mesa_vertices_per_prim(prim));
 }
@@ -1042,27 +1048,14 @@ struct topology_ctx {
 static bool
 evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
-   bool points = b->shader->info.gs.output_primitive == MESA_PRIM_POINTS;
-   bool end_prim = intr->intrinsic == nir_intrinsic_end_primitive_with_counter;
-   bool set_prim =
-      intr->intrinsic == nir_intrinsic_set_vertex_and_primitive_count;
-
    struct topology_ctx *ctx = data;
    struct agx_gs_info *info = ctx->info;
-   if (!(set_prim && points) && !end_prim)
+   if (intr->intrinsic != nir_intrinsic_emit_primitive_poly)
       return false;
 
-   assert(!(end_prim && points) && "should have been deleted");
-
-   /* Only consider the rasterization stream. */
-   if (nir_intrinsic_stream_id(intr) != 0)
-      return false;
-
-   /* All end primitives must be executed exactly once. That happens if
-    * everything is in the start block.
-    *
-    * Strictly we could relax this (to handle if-statements interleaved with
-    * other stuff).
+   /* All emit-primitives must execute exactly once. That happens if everything
+    * is in the start block. Strictly we could relax this (to handle
+    * if-statements interleaved with other stuff).
     */
    if (intr->instr.block != nir_start_block(b->impl)) {
       info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
@@ -1077,30 +1070,27 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
       return false;
    }
 
-   unsigned min = nir_verts_in_output_prim(b->shader);
-
-   if (nir_src_as_uint(intr->src[1]) >= min) {
-      _libagx_end_primitive(ctx->topology, nir_src_as_uint(intr->src[0]),
-                            nir_src_as_uint(intr->src[1]),
-                            nir_src_as_uint(intr->src[2]), 0, 0, !points);
-   }
-
+   _libagx_write_strip(
+      ctx->topology, nir_src_as_uint(intr->src[0]),
+      nir_src_as_uint(intr->src[1]), nir_src_as_uint(intr->src[2]),
+      nir_intrinsic_stream_id(intr), stream_multiplier(b->shader),
+      nir_verts_in_output_prim(b->shader));
    return false;
 }
 
 /*
  * Pattern match the index buffer with restart against a list topology:
  *
- *    0, 1, 2, -1, 3, 4, 5, -1, ...
+ *    0, 1, 2, -1, 3, 4, 5, ...
  */
 static bool
 match_list_topology(struct agx_gs_info *info, uint32_t count,
-                    uint32_t *topology)
+                    uint32_t *topology, bool has_restart)
 {
-   unsigned count_with_restart = count + 1;
+   unsigned count_with_restart = count + has_restart;
 
-   /* Must be an integer number of primitives */
-   if (info->max_indices % count_with_restart)
+   /* Must be an integer number of primitives. Last restart is dropped. */
+   if ((info->max_indices + has_restart) % count_with_restart)
       return false;
 
    /* Must match the list topology */
@@ -1115,7 +1105,8 @@ match_list_topology(struct agx_gs_info *info, uint32_t count,
    /* If we match, rewrite the topology and drop indexing */
    info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
    info->mode = u_decomposed_prim(info->mode);
-   info->max_indices = (info->max_indices / count_with_restart) * count;
+   info->max_indices =
+      ((info->max_indices + has_restart) / count_with_restart) * count;
    return true;
 }
 
@@ -1151,24 +1142,20 @@ static void
 optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
 {
    struct topology_ctx ctx = {.info = info};
+   bool has_restart = info->mode != MESA_PRIM_POINTS;
    nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, &ctx);
    if (info->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED)
       return;
 
-   /* Points are always lists */
-   if (gs->info.gs.output_primitive == MESA_PRIM_POINTS) {
-      info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
-      return;
-   }
+   /* We can always drop the trailing restart index */
+   if (has_restart && info->max_indices)
+      info->max_indices--;
 
    /* Try to pattern match a list topology */
    unsigned count = nir_verts_in_output_prim(gs);
-   if (match_list_topology(info, count, ctx.topology))
+   if (match_list_topology(info, count, ctx.topology, has_restart))
       return;
 
-   /* Instancing means we can always drop the trailing restart index */
-   info->max_indices--;
-
    /* Try to pattern match a strip topology */
    if (is_strip_topology(ctx.topology, info->max_indices)) {
       info->shape = AGX_GS_SHAPE_STATIC_PER_PRIM;
@@ -1178,6 +1165,8 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
    /* Otherwise, use a small static index buffer. There's no theoretical reason
     * to bound this, but we want small serialized shader info structs. We assume
     * that large static index buffers are rare and hence fall back to dynamic.
+    *
+    * XXX: check if this holds with streams.
     */
    if (info->max_indices >= ARRAY_SIZE(info->topology)) {
       info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
@@ -1193,9 +1182,8 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
 }
 
 bool
-agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
-                 nir_shader **gs_copy, nir_shader **pre_gs,
-                 struct agx_gs_info *info)
+agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
+                 nir_shader **pre_gs, struct agx_gs_info *info)
 {
    /* Lower I/O as assumed by the rest of GS lowering */
    if (gs->xfb_info != NULL) {
@@ -1232,13 +1220,7 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
    /* Lower geometry shader writes to contain all of the required counts, so we
     * know where in the various buffers we should write vertices.
     */
-   NIR_PASS(_, gs, nir_lower_gs_intrinsics,
-            nir_lower_gs_intrinsics_count_primitives |
-               nir_lower_gs_intrinsics_per_stream |
-               nir_lower_gs_intrinsics_count_vertices_per_primitive |
-               nir_lower_gs_intrinsics_overwrite_incomplete |
-               nir_lower_gs_intrinsics_always_end_primitive |
-               nir_lower_gs_intrinsics_count_decomposed_primitives);
+   NIR_PASS(_, gs, agx_nir_lower_gs_intrinsics);
 
    /* Clean up after all that lowering we did */
    bool progress = false;
@@ -1265,19 +1247,17 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
    /* If we know counts at compile-time we can simplify, so try to figure out
     * the counts statically.
     */
-   struct lower_gs_state gs_state = {
-      .rasterizer_discard = rasterizer_discard,
-      .info = info,
-   };
+   struct lower_gs_state gs_state = {.info = info};
 
    *info = (struct agx_gs_info){
       .mode = gs->info.gs.output_primitive,
       .xfb = gs->xfb_info != NULL,
       .shape = -1,
+      .multistream = gs->info.gs.active_stream_mask & ~1,
    };
 
-   int static_vertices[4] = {0}, static_primitives[4] = {0};
-   nir_gs_count_vertices_and_primitives(gs, static_vertices, static_primitives,
+   int static_indices[4] = {0};
+   nir_gs_count_vertices_and_primitives(gs, NULL, static_indices,
                                         gs_state.static_count, 4);
 
    /* Anything we don't know statically will be tracked by the count buffer.
@@ -1289,21 +1269,21 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
    }
 
    /* Using the gathered static counts, choose the index buffer stride. */
-   info->max_indices = calculate_max_indices(
-      gs->info.gs.output_primitive, gs->info.gs.vertices_out,
-      static_vertices[0], static_primitives[0]);
+   info->max_indices = static_indices[0];
+   if (static_indices[0] < 0) {
+      info->max_indices = calculate_max_indices(gs->info.gs.output_primitive,
+                                                gs->info.gs.vertices_out);
+   }
 
    info->prefix_sum = info->count_words > 0 && gs->xfb_info != NULL;
 
-   if (static_vertices[0] >= 0 && static_primitives[0] >= 0) {
+   if (static_indices[0] >= 0) {
       optimize_static_topology(info, gs);
    } else {
       info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
    }
 
-   bool side_effects_for_rast = false;
-   *gs_copy =
-      agx_nir_create_gs_rast_shader(gs, &side_effects_for_rast, &gs_state);
+   *gs_copy = agx_nir_create_gs_rast_shader(gs, &gs_state);
 
    NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
             nir_metadata_control_flow, NULL);
@@ -1320,44 +1300,20 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
    else
       *gs_count = NULL;
 
-   /* Geometry shader outputs are staged to temporaries */
-   struct lower_output_to_var_state state = {0};
+   /* Strip stores and atomics */
+   do {
+      progress = false;
+      NIR_PASS(progress, gs, nir_shader_intrinsics_pass,
+               strip_side_effect_from_main, nir_metadata_control_flow,
+               (void *)true);
 
-   u_foreach_bit64(slot, gs->info.outputs_written) {
-      /* After enough optimizations, the shader metadata can go out of sync, fix
-       * with our gathered info. Otherwise glsl_vector_type will assert fail.
-       */
-      if (component_counts[slot] == 0) {
-         gs->info.outputs_written &= ~BITFIELD64_BIT(slot);
-         continue;
-      }
-
-      const char *slot_name =
-         gl_varying_slot_name_for_stage(slot, MESA_SHADER_GEOMETRY);
-
-      for (unsigned i = 0; i < MAX_PRIM_OUT_SIZE; ++i) {
-         gs_state.outputs[slot][i] = nir_variable_create(
-            gs, nir_var_shader_temp,
-            glsl_vector_type(GLSL_TYPE_UINT, component_counts[slot]),
-            ralloc_asprintf(gs, "%s-%u", slot_name, i));
-      }
-
-      state.outputs[slot] = gs_state.outputs[slot][0];
-   }
-
-   NIR_PASS(_, gs, nir_shader_instructions_pass, lower_output_to_var,
-            nir_metadata_control_flow, &state);
+      NIR_PASS(progress, gs, nir_opt_dce);
+      NIR_PASS(progress, gs, nir_opt_dead_cf);
+   } while (progress);
 
    NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_instr,
             nir_metadata_none, &gs_state);
 
-   /* Determine if we are guaranteed to rasterize at least one vertex, so that
-    * we can strip the prepass of side effects knowing they will execute in the
-    * rasterization shader.
-    */
-   bool rasterizes_at_least_one_vertex =
-      !rasterizer_discard && static_vertices[0] > 0;
-
    /* Clean up after all that lowering we did */
    nir_lower_global_vars_to_local(gs);
    do {
@@ -1376,17 +1332,16 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
 
    } while (progress);
 
-   /* When rasterizing, we try to handle side effects sensibly. */
-   if (rasterizes_at_least_one_vertex && side_effects_for_rast) {
-      do {
-         progress = false;
-         NIR_PASS(progress, gs, nir_shader_intrinsics_pass,
-                  strip_side_effect_from_main, nir_metadata_control_flow, NULL);
+   /* Strip remaining atomics, but not stores - since those are from us */
+   do {
+      progress = false;
+      NIR_PASS(progress, gs, nir_shader_intrinsics_pass,
+               strip_side_effect_from_main, nir_metadata_control_flow,
+               (void *)false);
 
-         NIR_PASS(progress, gs, nir_opt_dce);
-         NIR_PASS(progress, gs, nir_opt_dead_cf);
-      } while (progress);
-   }
+      NIR_PASS(progress, gs, nir_opt_dce);
+      NIR_PASS(progress, gs, nir_opt_dead_cf);
+   } while (progress);
 
    /* All those variables we created should've gone away by now */
    NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL);
diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h
index fc3080f1e6d..e29705a9491 100644
--- a/src/asahi/lib/agx_nir_lower_gs.h
+++ b/src/asahi/lib/agx_nir_lower_gs.h
@@ -38,6 +38,9 @@ struct agx_gs_info {
    /* Whether a prefix sum is required on the count outputs. Implies xfb */
    bool prefix_sum;
 
+   /* Whether the GS writes to a stream other than stream #0 */
+   bool multistream;
+
    /* Shape of the rasterization draw, named by the instance ID */
    enum agx_gs_shape shape;
 
@@ -45,9 +48,9 @@ struct agx_gs_info {
    uint8_t topology[64];
 };
 
-bool agx_nir_lower_gs(struct nir_shader *gs, bool rasterizer_discard,
-                      struct nir_shader **gs_count, struct nir_shader **gs_copy,
-                      struct nir_shader **pre_gs, struct agx_gs_info *info);
+bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
+                      struct nir_shader **gs_copy, struct nir_shader **pre_gs,
+                      struct agx_gs_info *info);
 
 bool agx_nir_lower_tcs(struct nir_shader *tcs);
 
diff --git a/src/asahi/libagx/geometry.cl b/src/asahi/libagx/geometry.cl
index f72352339ea..1798f9e1dee 100644
--- a/src/asahi/libagx/geometry.cl
+++ b/src/asahi/libagx/geometry.cl
@@ -13,11 +13,12 @@
 #include "query.h"
 #include "tessellator.h"
 
-/* Swap the two non-provoking vertices third vert in odd triangles. This
- * generates a vertex ID list with a consistent winding order.
+/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
+ * ID list with a consistent winding order.
  *
- * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own
- * inverse. This lets us reuse it for both vertex fetch and transform feedback.
+ * Holding prim and flatshade_first constant, the map : [0, 1, 2] -> [0, 1, 2]
+ * is its own inverse. It is hence used for both vertex fetch and transform
+ * feedback.
  */
 uint
 libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
@@ -30,12 +31,49 @@ libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
    return (provoking || even) ? vert : ((3 - pv) - vert);
 }
 
-uint64_t
-libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index,
-                          uint vert, uint buffer, uint stride,
-                          uint output_offset)
+static inline uint
+xfb_prim(uint id, uint n, uint copy)
+{
+   return sub_sat(id, n - 1u) + copy;
+}
+
+/*
+ * Determine whether an output vertex has an n'th copy in the transform feedback
+ * buffer. This is written weirdly to let constant folding remove unnecessary
+ * stores when length is known statically.
+ */
+bool
+libagx_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
+{
+   uint prim = xfb_prim(id, n, copy);
+
+   int num_prims = length - (n - 1);
+   return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
+}
+
+uint
+libagx_xfb_vertex_offset(uint n, uint invocation_base_prim,
+                         uint strip_base_prim, uint id_in_strip, uint copy,
+                         bool flatshade_first)
+{
+   uint prim = xfb_prim(id_in_strip, n, copy);
+   uint vert_0 = min(id_in_strip, n - 1);
+   uint vert = vert_0 - copy;
+
+   if (n == 3) {
+      vert = libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);
+   }
+
+   /* Tally up in the whole buffer */
+   uint base_prim = invocation_base_prim + strip_base_prim;
+   uint base_vertex = base_prim * n;
+   return base_vertex + (prim * n) + vert;
+}
+
+uint64_t
+libagx_xfb_vertex_address(constant struct agx_geometry_params *p, uint index,
+                          uint buffer, uint stride, uint output_offset)
 {
-   uint index = base_index + vert;
    uint xfb_offset = (index * stride) + output_offset;
 
    return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
@@ -572,20 +610,20 @@ libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i,
 }
 
 void
-libagx_end_primitive(global uint32_t *index_buffer, uint total_verts,
-                     uint verts_in_prim, uint total_prims, uint index_offs,
-                     uint geometry_base, bool restart)
+libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
+                   uint32_t prim_index_offset, uint32_t vertex_offset,
+                   uint32_t verts_in_prim, uint3 info)
 {
-   _libagx_end_primitive(index_buffer, total_verts, verts_in_prim, total_prims,
-                         index_offs, geometry_base, restart);
+   _libagx_write_strip(index_buffer, inv_index_offset + prim_index_offset,
+                       vertex_offset, verts_in_prim, info.x, info.y, info.z);
 }
 
 void
-libagx_pad_index_gs(global int *index_buffer, uint total_verts,
-                    uint total_prims, uint id, uint alloc)
+libagx_pad_index_gs(global int *index_buffer, uint inv_index_offset,
+                    uint nr_indices, uint alloc)
 {
-   for (uint i = total_verts + total_prims; i < alloc; ++i) {
-      index_buffer[(id * alloc) + i] = -1;
+   for (uint i = nr_indices; i < alloc; ++i) {
+      index_buffer[inv_index_offset + i] = -1;
    }
 }
 
@@ -888,7 +926,7 @@ libagx_pre_gs(global struct agx_geometry_params *p, uint streams,
       int4 overflow = prims < in_prims;
 
       libagx_foreach_xfb(streams, i) {
-         p->xfb_prims[i] = prims[i];
+         p->xfb_verts[i] = prims[i] * vertices_per_prim;
 
          *(p->xfb_overflow[i]) += (bool)overflow[i];
          *(p->xfb_prims_generated_counter[i]) += prims[i];
diff --git a/src/asahi/libagx/geometry.h b/src/asahi/libagx/geometry.h
index 54ef991396b..a992b2ee392 100644
--- a/src/asahi/libagx/geometry.h
+++ b/src/asahi/libagx/geometry.h
@@ -227,10 +227,10 @@ struct agx_geometry_params {
 
    uint32_t xfb_size[MAX_SO_BUFFERS];
 
-   /* Number of primitives emitted by transform feedback per stream. Written by
+   /* Number of vertices emitted by transform feedback per stream. Written by
     * the pre-GS program.
     */
-   uint32_t xfb_prims[MAX_VERTEX_STREAMS];
+   uint32_t xfb_verts[MAX_VERTEX_STREAMS];
 
    /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
     * out by the GS indirect setup kernel or the CPU for a direct draw. This is
@@ -381,38 +381,26 @@ libagx_uncompact_prim(uint packed)
 }
 
 /*
- * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into
- * writes into the 32-bit output index buffer. We write the sequence (b, b + 1,
- * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n
- * (count) is the number of verts in the prims, and -1 is the prim restart index
- * used to signal the end of the prim.
+ * Write a strip into a 32-bit index buffer. This is the sequence:
  *
- * For points, we write index buffers without restart, just as a sideband to
- * pass data into the vertex shader.
+ *    (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
+ *
+ * For points, we write index buffers without restart just for remapping.
  */
 static inline void
-_libagx_end_primitive(GLOBAL uint32_t *index_buffer, uint32_t total_verts,
-                      uint32_t verts_in_prim, uint32_t total_prims,
-                      uint32_t index_offs, uint32_t geometry_base, bool restart)
+_libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
+                    uint32_t vertex_offset, uint32_t verts_in_prim,
+                    uint32_t stream, uint32_t stream_multiplier, uint32_t n)
 {
-   /* Previous verts/prims are from previous invocations plus earlier
-    * prims in this invocation. For the intra-invocation counts, we
-    * subtract the count for this prim from the inclusive sum NIR gives us.
-    */
-   uint32_t previous_verts_in_invoc = (total_verts - verts_in_prim);
-   uint32_t previous_verts = previous_verts_in_invoc;
-   uint32_t previous_prims = restart ? (total_prims - 1) : 0;
+   bool restart = n > 1;
+   if (verts_in_prim < n)
+      return;
 
-   /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */
-   uint32_t index_base = geometry_base + previous_verts_in_invoc;
-
-   /* Index buffer contains 1 index for each vertex and 1 for each prim */
-   GLOBAL uint32_t *out =
-      &index_buffer[index_offs + previous_verts + previous_prims];
+   GLOBAL uint32_t *out = &index_buffer[index_offset];
 
    /* Write out indices for the strip */
    for (uint32_t i = 0; i < verts_in_prim; ++i) {
-      out[i] = index_base + i;
+      out[i] = (vertex_offset + i) * stream_multiplier + stream;
    }
 
    if (restart)
diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h
index 7de4187f57d..0c058ffe6b7 100644
--- a/src/asahi/vulkan/hk_cmd_buffer.h
+++ b/src/asahi/vulkan/hk_cmd_buffer.h
@@ -93,6 +93,9 @@ struct hk_root_descriptor_table {
          uint16_t api_gs;
          uint16_t _pad5;
 
+         uint16_t rasterization_stream;
+         uint16_t _pad6;
+
          /* Mapping from varying slots written by the last vertex stage to UVS
           * indices. This mapping must be compatible with the fragment shader.
           */
diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c
index 07d42bb8dd5..eff4394464a 100644
--- a/src/asahi/vulkan/hk_cmd_draw.c
+++ b/src/asahi/vulkan/hk_cmd_draw.c
@@ -139,6 +139,22 @@ vk_conv_topology(VkPrimitiveTopology topology)
    }
 }
 
+static bool
+hk_rast_discard(struct hk_cmd_buffer *cmd)
+{
+   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
+
+   /* A non-zero rasterization stream acts as a rasterizer discard unless
+    * there's a multistream geometry shader bound.
+    */
+   if (dyn->rs.rasterization_stream != 0) {
+      struct hk_api_shader *gs = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
+      return !gs || !gs->variants[HK_GS_VARIANT_COUNT].info.gs.multistream;
+   }
+
+   return dyn->rs.rasterizer_discard_enable;
+}
+
 static void
 hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd)
 {
@@ -1111,13 +1127,10 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
 {
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
-   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
    struct hk_graphics_state *gfx = &cmd->state.gfx;
    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
-
-   bool rast_disc = dyn->rs.rasterizer_discard_enable;
-   struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
+   struct hk_shader *count = hk_count_gs_variant(gs);
 
    /* XXX: We should deduplicate this logic */
    bool indirect = agx_is_indirect(draw.b) ||
@@ -1197,6 +1210,9 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
       params.vs_grid[2] = params.gs_grid[2] = 1;
 
       if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
+         /* Need to allocate heap if we haven't yet */
+         hk_heap(cmd);
+
          cmd->geom_index_buffer = dev->heap->va->addr;
          cmd->geom_index_count = dev->heap->size;
       } else {
@@ -1455,13 +1471,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
    struct agx_grid grid_vs, grid_gs;
 
-   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
-   bool rast_disc = dyn->rs.rasterizer_discard_enable;
-
    struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
-   struct hk_shader *main = hk_main_gs_variant(gs, rast_disc);
-   struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
-   struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc);
+   struct hk_shader *main = hk_main_gs_variant(gs);
+   struct hk_shader *count = hk_count_gs_variant(gs);
+   struct hk_shader *pre_gs = hk_pre_gs_variant(gs);
 
    uint64_t geometry_params = desc->root.draw.geometry_params;
    unsigned count_words = count->info.gs.count_words;
@@ -1727,9 +1740,11 @@ hk_flush_shaders(struct hk_cmd_buffer *cmd)
 
    /* Geometry shading overrides the restart index, reemit on rebind */
    if (IS_SHADER_DIRTY(GEOMETRY)) {
+      struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
       struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
 
       desc->root.draw.api_gs = gs && !gs->is_passthrough;
+      BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
    }
 
    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
@@ -2405,7 +2420,7 @@ hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
       agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
                           linked_fs->b.fragment_control) {
 
-         cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable;
+         cfg.tag_write_disable = hk_rast_discard(cmd);
       }
    }
 
@@ -2496,7 +2511,7 @@ hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
          }
 
          cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking);
-         cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable;
+         cfg.rasterizer_discard = hk_rast_discard(cmd);
 
          /* We do not support unrestricted depth, so clamping is inverted from
           * clipping. This implementation seems to pass CTS without unrestricted
@@ -2650,6 +2665,13 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
       desc->root_dirty = true;
    }
 
+   if (IS_DIRTY(RS_RASTERIZATION_STREAM)) {
+      desc->root.draw.rasterization_stream = dyn->rs.rasterization_stream;
+      desc->root_dirty = true;
+
+      BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
+   }
+
    if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) ||
        IS_DIRTY(DS_DEPTH_COMPARE_OP)) {
 
@@ -3131,7 +3153,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
 
       agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
          cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking);
-         cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable;
+         cfg.unknown_4 = cfg.unknown_5 = hk_rast_discard(cmd);
          cfg.generate_primitive_id = gfx->generate_primitive_id;
       }
 
@@ -3562,14 +3584,6 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
 
       if (geom) {
          draw = hk_launch_gs_prerast(cmd, ccs, draw);
-
-         /* We must not draw if the app specified rasterizer discard. This is
-          * required for both performance (it is pointless to rasterize and
-          * there are no side effects), but also correctness (no indirect draw
-          * descriptor will be filled out).
-          */
-         if (dyn->rs.rasterizer_discard_enable)
-            continue;
       }
 
       if (adj) {
diff --git a/src/asahi/vulkan/hk_nir_lower_descriptors.c b/src/asahi/vulkan/hk_nir_lower_descriptors.c
index c3d92c44f3c..e7c6b083fc4 100644
--- a/src/asahi/vulkan/hk_nir_lower_descriptors.c
+++ b/src/asahi/vulkan/hk_nir_lower_descriptors.c
@@ -416,6 +416,9 @@ lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
    case nir_intrinsic_load_tess_param_buffer_agx:
       return lower_sysval_to_root_table(b, intrin, draw.tess_params);
 
+   case nir_intrinsic_load_rasterization_stream:
+      return lower_sysval_to_root_table(b, intrin, draw.rasterization_stream);
+
    case nir_intrinsic_load_is_first_fan_agx: {
       unsigned offset = hk_root_descriptor_offset(draw.provoking);
       b->cursor = nir_instr_remove(&intrin->instr);
diff --git a/src/asahi/vulkan/hk_shader.c b/src/asahi/vulkan/hk_shader.c
index 0f77fc1ad32..a170d473658 100644
--- a/src/asahi/vulkan/hk_shader.c
+++ b/src/asahi/vulkan/hk_shader.c
@@ -1276,67 +1276,44 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
 
    /* Compile all variants up front */
    if (sw_stage == MESA_SHADER_GEOMETRY) {
-      for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) {
-         struct hk_shader *main_variant = hk_main_gs_variant(obj, rast_disc);
-         struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc);
-         bool last = (rast_disc + 1) == 2;
+      struct hk_shader *main_variant = hk_main_gs_variant(obj);
+      struct hk_shader *count_variant = hk_count_gs_variant(obj);
 
-         /* Each variant gets its own NIR. To save an extra clone, we use the
-          * original NIR for the last stage.
-          */
-         nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir);
-         nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;
+      nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;
 
-         NIR_PASS(_, clone, agx_nir_lower_gs, rast_disc, &count, &rast, &pre_gs,
-                  &count_variant->info.gs);
+      NIR_PASS(_, nir, agx_nir_lower_gs, &count, &rast, &pre_gs,
+               &count_variant->info.gs);
 
-         if (!rast_disc) {
-            struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST];
+      struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST];
+      hk_lower_hw_vs(rast, shader, features);
+      shader->info.gs = count_variant->info.gs;
+      main_variant->info.gs = count_variant->info.gs;
 
-            hk_lower_hw_vs(rast, shader, features);
-            shader->info.gs = count_variant->info.gs;
-         }
+      struct {
+         nir_shader *in;
+         struct hk_shader *out;
+      } variants[] = {
+         {nir, hk_main_gs_variant(obj)},
+         {pre_gs, hk_pre_gs_variant(obj)},
+         {count, count_variant},
+         {rast, &obj->variants[HK_GS_VARIANT_RAST]},
+      };
 
-         main_variant->info.gs = count_variant->info.gs;
+      for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) {
+         if (variants[v].in) {
+            result = hk_compile_nir(
+               dev, pAllocator, variants[v].in, info->flags, info->robustness,
+               NULL, features, variants[v].out, sw_stage, true, NULL);
 
-         struct {
-            nir_shader *in;
-            struct hk_shader *out;
-         } variants[] = {
-            {clone, hk_main_gs_variant(obj, rast_disc)},
-            {pre_gs, hk_pre_gs_variant(obj, rast_disc)},
-            {count, count_variant},
-            {rast_disc ? NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]},
-         };
-
-         for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) {
-            if (variants[v].in) {
-               result =
-                  hk_compile_nir(dev, pAllocator, variants[v].in, info->flags,
-                                 info->robustness, NULL, features,
-                                 variants[v].out, sw_stage, true, NULL);
-               if (result != VK_SUCCESS) {
-                  hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
-                  if (clone != nir) {
-                     ralloc_free(nir);
-                  }
-
-                  ralloc_free(clone);
-                  ralloc_free(pre_gs);
-                  ralloc_free(count);
-                  ralloc_free(rast);
-                  return result;
-               }
+            if (result != VK_SUCCESS) {
+               hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
+               ralloc_free(nir);
+               ralloc_free(pre_gs);
+               ralloc_free(count);
+               ralloc_free(rast);
+               return result;
             }
          }
-
-         /* Nothing consumes this otherwise throw it away.
-          *
-          * TODO: We should just not generate it.
-          */
-         if (rast_disc) {
-            ralloc_free(rast);
-         }
       }
    } else if (sw_stage == MESA_SHADER_VERTEX ||
               sw_stage == MESA_SHADER_TESS_EVAL) {
diff --git a/src/asahi/vulkan/hk_shader.h b/src/asahi/vulkan/hk_shader.h
index a7dac8d3f79..fcade1cba1e 100644
--- a/src/asahi/vulkan/hk_shader.h
+++ b/src/asahi/vulkan/hk_shader.h
@@ -183,15 +183,12 @@ enum hk_gs_variant {
 
    /* Main compute shader */
    HK_GS_VARIANT_MAIN,
-   HK_GS_VARIANT_MAIN_NO_RAST,
 
    /* Count compute shader */
    HK_GS_VARIANT_COUNT,
-   HK_GS_VARIANT_COUNT_NO_RAST,
 
    /* Pre-GS compute shader */
    HK_GS_VARIANT_PRE,
-   HK_GS_VARIANT_PRE_NO_RAST,
 
    HK_GS_VARIANTS,
 };
@@ -200,11 +197,8 @@ enum hk_gs_variant {
 static const char *hk_gs_variant_name[] = {
    [HK_GS_VARIANT_RAST] = "Rasterization",
    [HK_GS_VARIANT_MAIN] = "Main",
-   [HK_GS_VARIANT_MAIN_NO_RAST] = "Main (rast. discard)",
    [HK_GS_VARIANT_COUNT] = "Count",
-   [HK_GS_VARIANT_COUNT_NO_RAST] = "Count (rast. discard)",
    [HK_GS_VARIANT_PRE] = "Pre-GS",
-   [HK_GS_VARIANT_PRE_NO_RAST] = "Pre-GS (rast. discard)",
 };
 /* clang-format on */
 
@@ -280,21 +274,21 @@ hk_any_variant(struct hk_api_shader *obj)
 }
 
 static struct hk_shader *
-hk_main_gs_variant(struct hk_api_shader *obj, bool rast_disc)
+hk_main_gs_variant(struct hk_api_shader *obj)
 {
-   return &obj->variants[HK_GS_VARIANT_MAIN + rast_disc];
+   return &obj->variants[HK_GS_VARIANT_MAIN];
 }
 
 static struct hk_shader *
-hk_count_gs_variant(struct hk_api_shader *obj, bool rast_disc)
+hk_count_gs_variant(struct hk_api_shader *obj)
 {
-   return &obj->variants[HK_GS_VARIANT_COUNT + rast_disc];
+   return &obj->variants[HK_GS_VARIANT_COUNT];
 }
 
 static struct hk_shader *
-hk_pre_gs_variant(struct hk_api_shader *obj, bool rast_disc)
+hk_pre_gs_variant(struct hk_api_shader *obj)
 {
-   return &obj->variants[HK_GS_VARIANT_PRE + rast_disc];
+   return &obj->variants[HK_GS_VARIANT_PRE];
 }
 
 #define HK_MAX_LINKED_USC_SIZE                                                 \
diff --git a/src/gallium/drivers/asahi/agx_disk_cache.c b/src/gallium/drivers/asahi/agx_disk_cache.c
index 864f7ed5545..f3acee12c6c 100644
--- a/src/gallium/drivers/asahi/agx_disk_cache.c
+++ b/src/gallium/drivers/asahi/agx_disk_cache.c
@@ -37,15 +37,10 @@ agx_disk_cache_compute_key(struct disk_cache *cache,
    if (uncompiled->type == PIPE_SHADER_VERTEX ||
        uncompiled->type == PIPE_SHADER_TESS_EVAL)
       key_size = sizeof(shader_key->vs);
-   else if (uncompiled->type == PIPE_SHADER_GEOMETRY)
-      key_size = sizeof(shader_key->gs);
    else if (uncompiled->type == PIPE_SHADER_FRAGMENT)
       key_size = sizeof(shader_key->fs);
-   else if (uncompiled->type == PIPE_SHADER_COMPUTE ||
-            uncompiled->type == PIPE_SHADER_TESS_CTRL)
-      key_size = 0;
    else
-      unreachable("Unsupported shader stage");
+      key_size = 0;
 
    memcpy(data, uncompiled->nir_sha1, hash_size);
 
diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
index aaceaf7bd69..4de075a4056 100644
--- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
+++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
@@ -199,6 +199,8 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_root(b, 1, 16, &u->no_epilog_discard);
    case nir_intrinsic_load_clip_z_coeff_agx:
       return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff));
+   case nir_intrinsic_load_rasterization_stream:
+      return nir_imm_int(b, 0);
    case nir_intrinsic_load_depth_never_agx:
       /* TODO: Do we need this workaround for anything in GL? */
       return nir_imm_intN_t(b, 0, 16);
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 910a64639c4..b8df7281fbc 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -1386,7 +1386,6 @@ agx_bind_vertex_elements_state(struct pipe_context *pctx, void *cso)
 }
 
 DERIVE_HASH_TABLE(asahi_vs_shader_key);
-DERIVE_HASH_TABLE(asahi_gs_shader_key);
 DERIVE_HASH_TABLE(asahi_fs_shader_key);
 DERIVE_HASH_TABLE(agx_fast_link_key);
 
@@ -1593,10 +1592,8 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
       NIR_PASS(_, nir, agx_nir_lower_tcs);
    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
-      struct asahi_gs_shader_key *key = &key_->gs;
-
-      NIR_PASS(_, nir, agx_nir_lower_gs, key->rasterizer_discard, &gs_count,
-               &gs_copy, &pre_gs, &gs_info);
+      NIR_PASS(_, nir, agx_nir_lower_gs, &gs_count, &gs_copy, &pre_gs,
+               &gs_info);
    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
       struct asahi_fs_shader_key *key = &key_->fs;
 
@@ -1724,11 +1721,7 @@ agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx,
    } else if (so->type == PIPE_SHADER_VERTEX ||
               so->type == PIPE_SHADER_TESS_EVAL) {
       memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key));
-   } else if (so->type == PIPE_SHADER_GEOMETRY) {
-      memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key));
    } else {
-      assert(gl_shader_stage_is_compute(so->type) ||
-             so->type == PIPE_SHADER_TESS_CTRL);
       /* No key */
    }
 
@@ -1918,9 +1911,8 @@ agx_create_shader_state(struct pipe_context *pctx,
        nir->info.stage == MESA_SHADER_TESS_EVAL) {
       so->variants = asahi_vs_shader_key_table_create(so);
       so->linked_shaders = agx_fast_link_key_table_create(so);
-   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
-      so->variants = asahi_gs_shader_key_table_create(so);
-   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL ||
+              nir->info.stage == MESA_SHADER_GEOMETRY) {
       /* No variants */
       so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
                                              asahi_cs_shader_key_equal);
@@ -1958,6 +1950,7 @@ agx_create_shader_state(struct pipe_context *pctx,
     * acceptable for now.
     */
    if ((so->type == PIPE_SHADER_TESS_CTRL) ||
+       (so->type == PIPE_SHADER_GEOMETRY) ||
        (so->type == PIPE_SHADER_FRAGMENT && !so->info.uses_fbfetch)) {
       union asahi_shader_key key = {0};
       agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &key);
@@ -1975,9 +1968,6 @@ agx_create_shader_state(struct pipe_context *pctx,
       union asahi_shader_key key = {0};
 
       switch (so->type) {
-      case PIPE_SHADER_GEOMETRY:
-         break;
-
       case PIPE_SHADER_TESS_EVAL:
          /* TODO: Tessellation shaders with shader-db */
          return so;
@@ -2256,12 +2246,10 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
          tgt->stride = gs->xfb_strides[i];
    }
 
-   struct asahi_gs_shader_key key = {
-      .rasterizer_discard = ctx->rast->base.rasterizer_discard,
-   };
-
-   return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
-                            (union asahi_shader_key *)&key);
+   ctx->gs = _mesa_hash_table_next_entry(
+                ctx->stage[PIPE_SHADER_GEOMETRY].shader->variants, NULL)
+                ->data;
+   return true;
 }
 
 static enum pipe_blendfactor
@@ -5147,9 +5135,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       /* Launch the pre-rasterization parts of the geometry shader */
       agx_launch_gs_prerast(batch, info, draws, indirect);
 
-      if (ctx->rast->base.rasterizer_discard)
-         return;
-
       /* Setup to rasterize the GS results */
       struct agx_gs_info *gsi = &ctx->gs->gs;
       info_gs = (struct pipe_draw_info){
@@ -5271,6 +5256,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    out = (void *)agx_vdm_draw((uint32_t *)out, 0 /* ignored for now */, draw,
                               agx_primitive_for_pipe(info->mode));
 
+   /* Barrier transform feedback writes on themselves for consistency.
+    * This is the other half of agx_legalize_xfb.
+    */
+   if (ctx->gs && ctx->streamout.num_targets > 0) {
+      struct agx_device *dev = agx_device(ctx->base.screen);
+      out = (void *)agx_vdm_barrier((uint32_t *)out, dev->chip);
+   }
+
    batch->vdm.current = out;
    assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end &&
           "Failed to reserve sufficient space in encoder");
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 5ae930d4af4..057d7036e2e 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -514,16 +514,8 @@ struct asahi_fs_shader_key {
 };
 static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes");
 
-struct asahi_gs_shader_key {
-   /* If true, this GS is run only for its side effects (including XFB) */
-   bool rasterizer_discard;
-   bool padding[7];
-};
-static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes");
-
 union asahi_shader_key {
    struct asahi_vs_shader_key vs;
-   struct asahi_gs_shader_key gs;
    struct asahi_fs_shader_key fs;
 };