Mirror of https://gitlab.freedesktop.org/mesa/mesa.git
asahi,hk: significantly rework GS
Get rid of the rasterizer discard variants by pushing XFB into the hardware VS and letting everything cascade down from there. That then means the hardware VS runs for all streams, which gives us dynamic rasterization stream selection.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35802>
parent 03a5b7f25c
commit 1f7fe678e7
13 changed files with 575 additions and 618 deletions
File diff suppressed because it is too large
@@ -38,6 +38,9 @@ struct agx_gs_info {
    /* Whether a prefix sum is required on the count outputs. Implies xfb */
    bool prefix_sum;

+   /* Whether the GS writes to a stream other than stream #0 */
+   bool multistream;
+
    /* Shape of the rasterization draw, named by the instance ID */
    enum agx_gs_shape shape;
@@ -45,9 +48,9 @@ struct agx_gs_info {
    uint8_t topology[64];
 };

-bool agx_nir_lower_gs(struct nir_shader *gs, bool rasterizer_discard,
-                      struct nir_shader **gs_count, struct nir_shader **gs_copy,
-                      struct nir_shader **pre_gs, struct agx_gs_info *info);
+bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
+                      struct nir_shader **gs_copy, struct nir_shader **pre_gs,
+                      struct agx_gs_info *info);

 bool agx_nir_lower_tcs(struct nir_shader *tcs);
@@ -13,11 +13,12 @@
 #include "query.h"
 #include "tessellator.h"

-/* Swap the two non-provoking vertices third vert in odd triangles. This
- * generates a vertex ID list with a consistent winding order.
+/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
+ * ID list with a consistent winding order.
  *
- * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own
- * inverse. This lets us reuse it for both vertex fetch and transform feedback.
+ * Holding prim and flatshade_first constant, the map : [0, 1, 2] -> [0, 1, 2]
+ * is its own inverse. It is hence used for both vertex fetch and transform
+ * feedback.
  */
 uint
 libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
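To make the involution claim concrete, here is a minimal host-side sketch of the mapping. It is illustrative only: the choice pv = flatshade_first ? 0 : 2 is an assumption, and only the return expression is taken from the hunk below. Applying the map twice always returns the original vertex, which is why the same routine can serve both vertex fetch and transform feedback.

#include <assert.h>
#include <stdbool.h>

/* Host-side model of the tri-strip vertex map, for illustration only. */
static unsigned
map_vertex_in_tri_strip(unsigned prim, unsigned vert, bool flatshade_first)
{
   unsigned pv = flatshade_first ? 0 : 2; /* assumed provoking-vertex slot */
   bool even = (prim & 1) == 0;
   bool provoking = vert == pv;

   return (provoking || even) ? vert : ((3 - pv) - vert);
}

int
main(void)
{
   /* The map is its own inverse for every primitive parity and both
    * provoking-vertex conventions.
    */
   for (unsigned prim = 0; prim < 2; ++prim)
      for (unsigned vert = 0; vert < 3; ++vert)
         for (int ff = 0; ff < 2; ++ff) {
            unsigned once = map_vertex_in_tri_strip(prim, vert, ff);
            unsigned twice = map_vertex_in_tri_strip(prim, once, ff);
            assert(twice == vert);
         }
   return 0;
}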
@@ -30,12 +31,49 @@ libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
    return (provoking || even) ? vert : ((3 - pv) - vert);
 }

-uint64_t
-libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index,
-                          uint vert, uint buffer, uint stride,
-                          uint output_offset)
+static inline uint
+xfb_prim(uint id, uint n, uint copy)
+{
+   return sub_sat(id, n - 1u) + copy;
+}
+
+/*
+ * Determine whether an output vertex has an n'th copy in the transform feedback
+ * buffer. This is written weirdly to let constant folding remove unnecessary
+ * stores when length is known statically.
+ */
+bool
+libagx_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
+{
+   uint prim = xfb_prim(id, n, copy);
+
+   int num_prims = length - (n - 1);
+   return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
+}
+
+uint
+libagx_xfb_vertex_offset(uint n, uint invocation_base_prim,
+                         uint strip_base_prim, uint id_in_strip, uint copy,
+                         bool flatshade_first)
+{
+   uint prim = xfb_prim(id_in_strip, n, copy);
+   uint vert_0 = min(id_in_strip, n - 1);
+   uint vert = vert_0 - copy;
+
+   if (n == 3) {
+      vert = libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);
+   }
+
+   /* Tally up in the whole buffer */
+   uint base_prim = invocation_base_prim + strip_base_prim;
+   uint base_vertex = base_prim * n;
+   return base_vertex + (prim * n) + vert;
+}
+
+uint64_t
+libagx_xfb_vertex_address(constant struct agx_geometry_params *p, uint index,
+                          uint buffer, uint stride, uint output_offset)
 {
-   uint index = base_index + vert;
    uint xfb_offset = (index * stride) + output_offset;

    return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
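The helpers added above unroll each output strip into independent primitives for transform feedback. A small host-side sketch (plain C stand-ins for sub_sat/min, base primitive assumed 0, flatshade_first assumed false so the provoking vertex is the last one) walks a 5-vertex triangle strip and prints where each copy of each vertex lands. The strip unrolls to (0,1,2), (2,1,3), (2,3,4), so vertex 2 gets three copies at slots 2, 3 and 6.

#include <stdbool.h>
#include <stdio.h>

/* Host-side copies of the helpers above, for illustration only. */
static unsigned
sub_sat(unsigned a, unsigned b)
{
   return a > b ? a - b : 0;
}

static unsigned
xfb_prim(unsigned id, unsigned n, unsigned copy)
{
   return sub_sat(id, n - 1u) + copy;
}

static bool
xfb_vertex_copy_in_strip(unsigned n, unsigned id, unsigned length, unsigned copy)
{
   unsigned prim = xfb_prim(id, n, copy);
   int num_prims = (int)length - (int)(n - 1);
   return copy == 0 ||
          ((int)prim < num_prims && id >= copy && (int)copy < num_prims);
}

/* Winding fix as in libagx_map_vertex_in_tri_strip; pv = 2 assumed. */
static unsigned
map_vertex_in_tri_strip(unsigned prim, unsigned vert)
{
   bool even = (prim & 1) == 0;
   bool provoking = vert == 2;
   return (provoking || even) ? vert : (1 - vert);
}

static unsigned
xfb_vertex_offset(unsigned n, unsigned id_in_strip, unsigned copy)
{
   unsigned prim = xfb_prim(id_in_strip, n, copy);
   unsigned vert = (id_in_strip < n - 1 ? id_in_strip : n - 1) - copy;

   if (n == 3)
      vert = map_vertex_in_tri_strip(prim, vert);

   return (prim * n) + vert; /* base primitive assumed 0 for the example */
}

int
main(void)
{
   const unsigned n = 3, length = 5; /* 5-vertex strip -> 3 triangles */

   for (unsigned id = 0; id < length; ++id)
      for (unsigned copy = 0; copy < n; ++copy)
         if (xfb_vertex_copy_in_strip(n, id, length, copy))
            printf("vertex %u, copy %u -> slot %u\n", id, copy,
                   xfb_vertex_offset(n, id, copy));
   return 0;
}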
@@ -572,20 +610,20 @@ libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i,
 }

 void
-libagx_end_primitive(global uint32_t *index_buffer, uint total_verts,
-                     uint verts_in_prim, uint total_prims, uint index_offs,
-                     uint geometry_base, bool restart)
+libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
+                   uint32_t prim_index_offset, uint32_t vertex_offset,
+                   uint32_t verts_in_prim, uint3 info)
 {
-   _libagx_end_primitive(index_buffer, total_verts, verts_in_prim, total_prims,
-                         index_offs, geometry_base, restart);
+   _libagx_write_strip(index_buffer, inv_index_offset + prim_index_offset,
+                       vertex_offset, verts_in_prim, info.x, info.y, info.z);
 }

 void
-libagx_pad_index_gs(global int *index_buffer, uint total_verts,
-                    uint total_prims, uint id, uint alloc)
+libagx_pad_index_gs(global int *index_buffer, uint inv_index_offset,
+                    uint nr_indices, uint alloc)
 {
-   for (uint i = total_verts + total_prims; i < alloc; ++i) {
-      index_buffer[(id * alloc) + i] = -1;
+   for (uint i = nr_indices; i < alloc; ++i) {
+      index_buffer[inv_index_offset + i] = -1;
    }
 }
@@ -888,7 +926,7 @@ libagx_pre_gs(global struct agx_geometry_params *p, uint streams,
    int4 overflow = prims < in_prims;

    libagx_foreach_xfb(streams, i) {
-      p->xfb_prims[i] = prims[i];
+      p->xfb_verts[i] = prims[i] * vertices_per_prim;

       *(p->xfb_overflow[i]) += (bool)overflow[i];
       *(p->xfb_prims_generated_counter[i]) += prims[i];
@@ -227,10 +227,10 @@ struct agx_geometry_params {

    uint32_t xfb_size[MAX_SO_BUFFERS];

-   /* Number of primitives emitted by transform feedback per stream. Written by
+   /* Number of vertices emitted by transform feedback per stream. Written by
     * the pre-GS program.
     */
-   uint32_t xfb_prims[MAX_VERTEX_STREAMS];
+   uint32_t xfb_verts[MAX_VERTEX_STREAMS];

    /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
     * out by the GS indirect setup kernel or the CPU for a direct draw. This is
@@ -381,38 +381,26 @@ libagx_uncompact_prim(uint packed)
 }

 /*
- * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into
- * writes into the 32-bit output index buffer. We write the sequence (b, b + 1,
- * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n
- * (count) is the number of verts in the prims, and -1 is the prim restart index
- * used to signal the end of the prim.
+ * Write a strip into a 32-bit index buffer. This is the sequence:
  *
- * For points, we write index buffers without restart, just as a sideband to
- * pass data into the vertex shader.
+ * (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
+ *
+ * For points, we write index buffers without restart just for remapping.
  */
 static inline void
-_libagx_end_primitive(GLOBAL uint32_t *index_buffer, uint32_t total_verts,
-                      uint32_t verts_in_prim, uint32_t total_prims,
-                      uint32_t index_offs, uint32_t geometry_base, bool restart)
+_libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
+                    uint32_t vertex_offset, uint32_t verts_in_prim,
+                    uint32_t stream, uint32_t stream_multiplier, uint32_t n)
 {
-   /* Previous verts/prims are from previous invocations plus earlier
-    * prims in this invocation. For the intra-invocation counts, we
-    * subtract the count for this prim from the inclusive sum NIR gives us.
-    */
-   uint32_t previous_verts_in_invoc = (total_verts - verts_in_prim);
-   uint32_t previous_verts = previous_verts_in_invoc;
-   uint32_t previous_prims = restart ? (total_prims - 1) : 0;
+   bool restart = n > 1;
+   if (verts_in_prim < n)
+      return;

-   /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */
-   uint32_t index_base = geometry_base + previous_verts_in_invoc;
-
-   /* Index buffer contains 1 index for each vertex and 1 for each prim */
-   GLOBAL uint32_t *out =
-      &index_buffer[index_offs + previous_verts + previous_prims];
+   GLOBAL uint32_t *out = &index_buffer[index_offset];

    /* Write out indices for the strip */
    for (uint32_t i = 0; i < verts_in_prim; ++i) {
-      out[i] = index_base + i;
+      out[i] = (vertex_offset + i) * stream_multiplier + stream;
    }

    if (restart)
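The new index encoding packs the emitting stream into every index, so one buffer can carry all streams side by side. A tiny worked example with hypothetical values (vertex_offset 4, stream 1, stream_multiplier 4) mirrors the write loop above and yields 17, 21, 25 followed by the restart index:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
   /* A 3-vertex strip starting at vertex_offset 4, emitted on stream 1 with
    * stream_multiplier 4 (all values hypothetical). n > 1, so a restart
    * index follows the strip.
    */
   uint32_t out[4];
   uint32_t vertex_offset = 4, stream = 1, stream_multiplier = 4;
   uint32_t verts_in_prim = 3;

   for (uint32_t i = 0; i < verts_in_prim; ++i)
      out[i] = (vertex_offset + i) * stream_multiplier + stream;

   out[verts_in_prim] = ~0u; /* restart */

   for (uint32_t i = 0; i < 4; ++i)
      printf("%d\n", (int)(int32_t)out[i]); /* 17, 21, 25, -1 */
   return 0;
}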
@@ -93,6 +93,9 @@ struct hk_root_descriptor_table {
       uint16_t api_gs;
       uint16_t _pad5;

+      uint16_t rasterization_stream;
+      uint16_t _pad6;
+
       /* Mapping from varying slots written by the last vertex stage to UVS
        * indices. This mapping must be compatible with the fragment shader.
        */
@@ -139,6 +139,22 @@ vk_conv_topology(VkPrimitiveTopology topology)
    }
 }

+static bool
+hk_rast_discard(struct hk_cmd_buffer *cmd)
+{
+   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
+
+   /* A non-zero rasterization stream acts as a rasterizer discard unless
+    * there's a multistream geometry shader bound.
+    */
+   if (dyn->rs.rasterization_stream != 0) {
+      struct hk_api_shader *gs = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
+      return !gs || !gs->variants[HK_GS_VARIANT_COUNT].info.gs.multistream;
+   }
+
+   return dyn->rs.rasterizer_discard_enable;
+}
+
 static void
 hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd)
 {
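Restated as a standalone sketch, the decision the helper above encodes (same logic, with the inputs passed explicitly instead of read from the command buffer):

#include <stdbool.h>
#include <stdio.h>

/* Illustration only: mirrors hk_rast_discard's decision. */
static bool
rast_discard(unsigned rasterization_stream, bool discard_enable,
             bool has_gs, bool gs_multistream)
{
   /* A non-zero rasterization stream discards unless a multistream GS is
    * bound; otherwise the API's rasterizer-discard state applies.
    */
   if (rasterization_stream != 0)
      return !has_gs || !gs_multistream;

   return discard_enable;
}

int
main(void)
{
   /* Stream 1 with a single-stream GS behaves like rasterizer discard... */
   printf("%d\n", rast_discard(1, false, true, false)); /* 1 */
   /* ...but a multistream GS keeps rasterization alive. */
   printf("%d\n", rast_discard(1, false, true, true));  /* 0 */
   return 0;
}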
@@ -1111,13 +1127,10 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
 {
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
-   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
    struct hk_graphics_state *gfx = &cmd->state.gfx;
    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
-
-   bool rast_disc = dyn->rs.rasterizer_discard_enable;
-   struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
+   struct hk_shader *count = hk_count_gs_variant(gs);

    /* XXX: We should deduplicate this logic */
    bool indirect = agx_is_indirect(draw.b) ||
@@ -1197,6 +1210,9 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
   params.vs_grid[2] = params.gs_grid[2] = 1;

   if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
      /* Need to allocate heap if we haven't yet */
      hk_heap(cmd);

      cmd->geom_index_buffer = dev->heap->va->addr;
      cmd->geom_index_count = dev->heap->size;
   } else {
@@ -1455,13 +1471,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
    struct agx_grid grid_vs, grid_gs;

-   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
-   bool rast_disc = dyn->rs.rasterizer_discard_enable;
-
    struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
-   struct hk_shader *main = hk_main_gs_variant(gs, rast_disc);
-   struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
-   struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc);
+   struct hk_shader *main = hk_main_gs_variant(gs);
+   struct hk_shader *count = hk_count_gs_variant(gs);
+   struct hk_shader *pre_gs = hk_pre_gs_variant(gs);

    uint64_t geometry_params = desc->root.draw.geometry_params;
    unsigned count_words = count->info.gs.count_words;
@@ -1727,9 +1740,11 @@ hk_flush_shaders(cmd)

    /* Geometry shading overrides the restart index, reemit on rebind */
    if (IS_SHADER_DIRTY(GEOMETRY)) {
+      struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
       struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];

       desc->root.draw.api_gs = gs && !gs->is_passthrough;
+      BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
    }

    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
@@ -2405,7 +2420,7 @@ hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
       agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
                           linked_fs->b.fragment_control) {

-         cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable;
+         cfg.tag_write_disable = hk_rast_discard(cmd);
       }
    }
@@ -2496,7 +2511,7 @@ hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
      }

      cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking);
-     cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable;
+     cfg.rasterizer_discard = hk_rast_discard(cmd);

      /* We do not support unrestricted depth, so clamping is inverted from
       * clipping. This implementation seems to pass CTS without unrestricted
@@ -2650,6 +2665,13 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
       desc->root_dirty = true;
    }

+   if (IS_DIRTY(RS_RASTERIZATION_STREAM)) {
+      desc->root.draw.rasterization_stream = dyn->rs.rasterization_stream;
+      desc->root_dirty = true;
+
+      BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
+   }
+
    if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) ||
        IS_DIRTY(DS_DEPTH_COMPARE_OP)) {
|
@ -3131,7 +3153,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
|
|||
|
||||
agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
|
||||
cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking);
|
||||
cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable;
|
||||
cfg.unknown_4 = cfg.unknown_5 = hk_rast_discard(cmd);
|
||||
cfg.generate_primitive_id = gfx->generate_primitive_id;
|
||||
}
|
||||
|
||||
|
|
@@ -3562,14 +3584,6 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)

       if (geom) {
          draw = hk_launch_gs_prerast(cmd, ccs, draw);
-
-         /* We must not draw if the app specified rasterizer discard. This is
-          * required for both performance (it is pointless to rasterize and
-          * there are no side effects), but also correctness (no indirect draw
-          * descriptor will be filled out).
-          */
-         if (dyn->rs.rasterizer_discard_enable)
-            continue;
       }

       if (adj) {
|||
|
|
@ -416,6 +416,9 @@ lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
|
|||
case nir_intrinsic_load_tess_param_buffer_agx:
|
||||
return lower_sysval_to_root_table(b, intrin, draw.tess_params);
|
||||
|
||||
case nir_intrinsic_load_rasterization_stream:
|
||||
return lower_sysval_to_root_table(b, intrin, draw.rasterization_stream);
|
||||
|
||||
case nir_intrinsic_load_is_first_fan_agx: {
|
||||
unsigned offset = hk_root_descriptor_offset(draw.provoking);
|
||||
b->cursor = nir_instr_remove(&intrin->instr);
|
||||
|
|
|
|||
|
|
@ -1276,67 +1276,44 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
|
|||
|
||||
/* Compile all variants up front */
|
||||
if (sw_stage == MESA_SHADER_GEOMETRY) {
|
||||
for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) {
|
||||
struct hk_shader *main_variant = hk_main_gs_variant(obj, rast_disc);
|
||||
struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc);
|
||||
bool last = (rast_disc + 1) == 2;
|
||||
struct hk_shader *main_variant = hk_main_gs_variant(obj);
|
||||
struct hk_shader *count_variant = hk_count_gs_variant(obj);
|
||||
|
||||
/* Each variant gets its own NIR. To save an extra clone, we use the
|
||||
* original NIR for the last stage.
|
||||
*/
|
||||
nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir);
|
||||
nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;
|
||||
nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;
|
||||
|
||||
NIR_PASS(_, clone, agx_nir_lower_gs, rast_disc, &count, &rast, &pre_gs,
|
||||
&count_variant->info.gs);
|
||||
NIR_PASS(_, nir, agx_nir_lower_gs, &count, &rast, &pre_gs,
|
||||
&count_variant->info.gs);
|
||||
|
||||
if (!rast_disc) {
|
||||
struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST];
|
||||
struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST];
|
||||
hk_lower_hw_vs(rast, shader, features);
|
||||
shader->info.gs = count_variant->info.gs;
|
||||
main_variant->info.gs = count_variant->info.gs;
|
||||
|
||||
hk_lower_hw_vs(rast, shader, features);
|
||||
shader->info.gs = count_variant->info.gs;
|
||||
}
|
||||
struct {
|
||||
nir_shader *in;
|
||||
struct hk_shader *out;
|
||||
} variants[] = {
|
||||
{nir, hk_main_gs_variant(obj)},
|
||||
{pre_gs, hk_pre_gs_variant(obj)},
|
||||
{count, count_variant},
|
||||
{rast, &obj->variants[HK_GS_VARIANT_RAST]},
|
||||
};
|
||||
|
||||
main_variant->info.gs = count_variant->info.gs;
|
||||
for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) {
|
||||
if (variants[v].in) {
|
||||
result = hk_compile_nir(
|
||||
dev, pAllocator, variants[v].in, info->flags, info->robustness,
|
||||
NULL, features, variants[v].out, sw_stage, true, NULL);
|
||||
|
||||
struct {
|
||||
nir_shader *in;
|
||||
struct hk_shader *out;
|
||||
} variants[] = {
|
||||
{clone, hk_main_gs_variant(obj, rast_disc)},
|
||||
{pre_gs, hk_pre_gs_variant(obj, rast_disc)},
|
||||
{count, count_variant},
|
||||
{rast_disc ? NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]},
|
||||
};
|
||||
|
||||
for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) {
|
||||
if (variants[v].in) {
|
||||
result =
|
||||
hk_compile_nir(dev, pAllocator, variants[v].in, info->flags,
|
||||
info->robustness, NULL, features,
|
||||
variants[v].out, sw_stage, true, NULL);
|
||||
if (result != VK_SUCCESS) {
|
||||
hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
|
||||
if (clone != nir) {
|
||||
ralloc_free(nir);
|
||||
}
|
||||
|
||||
ralloc_free(clone);
|
||||
ralloc_free(pre_gs);
|
||||
ralloc_free(count);
|
||||
ralloc_free(rast);
|
||||
return result;
|
||||
}
|
||||
if (result != VK_SUCCESS) {
|
||||
hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
|
||||
ralloc_free(nir);
|
||||
ralloc_free(pre_gs);
|
||||
ralloc_free(count);
|
||||
ralloc_free(rast);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/* Nothing consumes this otherwise throw it away.
|
||||
*
|
||||
* TODO: We should just not generate it.
|
||||
*/
|
||||
if (rast_disc) {
|
||||
ralloc_free(rast);
|
||||
}
|
||||
}
|
||||
} else if (sw_stage == MESA_SHADER_VERTEX ||
|
||||
sw_stage == MESA_SHADER_TESS_EVAL) {
|
||||
|
|
|
|||
|
|
@ -183,15 +183,12 @@ enum hk_gs_variant {
|
|||
|
||||
/* Main compute shader */
|
||||
HK_GS_VARIANT_MAIN,
|
||||
HK_GS_VARIANT_MAIN_NO_RAST,
|
||||
|
||||
/* Count compute shader */
|
||||
HK_GS_VARIANT_COUNT,
|
||||
HK_GS_VARIANT_COUNT_NO_RAST,
|
||||
|
||||
/* Pre-GS compute shader */
|
||||
HK_GS_VARIANT_PRE,
|
||||
HK_GS_VARIANT_PRE_NO_RAST,
|
||||
|
||||
HK_GS_VARIANTS,
|
||||
};
|
||||
|
|
@@ -200,11 +197,8 @@ enum hk_gs_variant {
 static const char *hk_gs_variant_name[] = {
    [HK_GS_VARIANT_RAST] = "Rasterization",
    [HK_GS_VARIANT_MAIN] = "Main",
-   [HK_GS_VARIANT_MAIN_NO_RAST] = "Main (rast. discard)",
    [HK_GS_VARIANT_COUNT] = "Count",
-   [HK_GS_VARIANT_COUNT_NO_RAST] = "Count (rast. discard)",
    [HK_GS_VARIANT_PRE] = "Pre-GS",
-   [HK_GS_VARIANT_PRE_NO_RAST] = "Pre-GS (rast. discard)",
 };
 /* clang-format on */
@@ -280,21 +274,21 @@ hk_any_variant(obj)
 }

 static struct hk_shader *
-hk_main_gs_variant(struct hk_api_shader *obj, bool rast_disc)
+hk_main_gs_variant(struct hk_api_shader *obj)
 {
-   return &obj->variants[HK_GS_VARIANT_MAIN + rast_disc];
+   return &obj->variants[HK_GS_VARIANT_MAIN];
 }

 static struct hk_shader *
-hk_count_gs_variant(struct hk_api_shader *obj, bool rast_disc)
+hk_count_gs_variant(struct hk_api_shader *obj)
 {
-   return &obj->variants[HK_GS_VARIANT_COUNT + rast_disc];
+   return &obj->variants[HK_GS_VARIANT_COUNT];
 }

 static struct hk_shader *
-hk_pre_gs_variant(struct hk_api_shader *obj, bool rast_disc)
+hk_pre_gs_variant(struct hk_api_shader *obj)
 {
-   return &obj->variants[HK_GS_VARIANT_PRE + rast_disc];
+   return &obj->variants[HK_GS_VARIANT_PRE];
 }

 #define HK_MAX_LINKED_USC_SIZE \
|||
|
|
@ -37,15 +37,10 @@ agx_disk_cache_compute_key(struct disk_cache *cache,
|
|||
if (uncompiled->type == PIPE_SHADER_VERTEX ||
|
||||
uncompiled->type == PIPE_SHADER_TESS_EVAL)
|
||||
key_size = sizeof(shader_key->vs);
|
||||
else if (uncompiled->type == PIPE_SHADER_GEOMETRY)
|
||||
key_size = sizeof(shader_key->gs);
|
||||
else if (uncompiled->type == PIPE_SHADER_FRAGMENT)
|
||||
key_size = sizeof(shader_key->fs);
|
||||
else if (uncompiled->type == PIPE_SHADER_COMPUTE ||
|
||||
uncompiled->type == PIPE_SHADER_TESS_CTRL)
|
||||
key_size = 0;
|
||||
else
|
||||
unreachable("Unsupported shader stage");
|
||||
key_size = 0;
|
||||
|
||||
memcpy(data, uncompiled->nir_sha1, hash_size);
|
||||
|
||||
|
|
|
|||
|
|
@@ -199,6 +199,8 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_root(b, 1, 16, &u->no_epilog_discard);
    case nir_intrinsic_load_clip_z_coeff_agx:
       return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff));
+   case nir_intrinsic_load_rasterization_stream:
+      return nir_imm_int(b, 0);
    case nir_intrinsic_load_depth_never_agx:
       /* TODO: Do we need this workaround for anything in GL? */
       return nir_imm_intN_t(b, 0, 16);
@@ -1386,7 +1386,6 @@ agx_bind_vertex_elements_state(struct pipe_context *pctx, void *cso)
 }

 DERIVE_HASH_TABLE(asahi_vs_shader_key);
-DERIVE_HASH_TABLE(asahi_gs_shader_key);
 DERIVE_HASH_TABLE(asahi_fs_shader_key);
 DERIVE_HASH_TABLE(agx_fast_link_key);

@@ -1593,10 +1592,8 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
       NIR_PASS(_, nir, agx_nir_lower_tcs);
    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
-      struct asahi_gs_shader_key *key = &key_->gs;
-
-      NIR_PASS(_, nir, agx_nir_lower_gs, key->rasterizer_discard, &gs_count,
-               &gs_copy, &pre_gs, &gs_info);
+      NIR_PASS(_, nir, agx_nir_lower_gs, &gs_count, &gs_copy, &pre_gs,
+               &gs_info);
    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
       struct asahi_fs_shader_key *key = &key_->fs;

@@ -1724,11 +1721,7 @@ agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx,
    } else if (so->type == PIPE_SHADER_VERTEX ||
               so->type == PIPE_SHADER_TESS_EVAL) {
       memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key));
-   } else if (so->type == PIPE_SHADER_GEOMETRY) {
-      memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key));
    } else {
       assert(gl_shader_stage_is_compute(so->type) ||
              so->type == PIPE_SHADER_TESS_CTRL);
       /* No key */
    }

@@ -1918,9 +1911,8 @@ agx_create_shader_state(struct pipe_context *pctx,
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
       so->variants = asahi_vs_shader_key_table_create(so);
       so->linked_shaders = agx_fast_link_key_table_create(so);
-   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
-      so->variants = asahi_gs_shader_key_table_create(so);
-   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
+   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL ||
+              nir->info.stage == MESA_SHADER_GEOMETRY) {
       /* No variants */
       so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
                                              asahi_cs_shader_key_equal);
@@ -1958,6 +1950,7 @@ agx_create_shader_state(struct pipe_context *pctx,
     * acceptable for now.
     */
    if ((so->type == PIPE_SHADER_TESS_CTRL) ||
+       (so->type == PIPE_SHADER_GEOMETRY) ||
        (so->type == PIPE_SHADER_FRAGMENT && !so->info.uses_fbfetch)) {
       union asahi_shader_key key = {0};
       agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &key);
@@ -1975,9 +1968,6 @@ agx_create_shader_state(struct pipe_context *pctx,
    union asahi_shader_key key = {0};

    switch (so->type) {
-   case PIPE_SHADER_GEOMETRY:
-      break;
-
    case PIPE_SHADER_TESS_EVAL:
       /* TODO: Tessellation shaders with shader-db */
       return so;
@@ -2256,12 +2246,10 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
       tgt->stride = gs->xfb_strides[i];
    }

-   struct asahi_gs_shader_key key = {
-      .rasterizer_discard = ctx->rast->base.rasterizer_discard,
-   };
-
-   return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
-                            (union asahi_shader_key *)&key);
+   ctx->gs = _mesa_hash_table_next_entry(
+                ctx->stage[PIPE_SHADER_GEOMETRY].shader->variants, NULL)
+                ->data;
+   return true;
 }

 static enum pipe_blendfactor
|
@ -5147,9 +5135,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
|||
/* Launch the pre-rasterization parts of the geometry shader */
|
||||
agx_launch_gs_prerast(batch, info, draws, indirect);
|
||||
|
||||
if (ctx->rast->base.rasterizer_discard)
|
||||
return;
|
||||
|
||||
/* Setup to rasterize the GS results */
|
||||
struct agx_gs_info *gsi = &ctx->gs->gs;
|
||||
info_gs = (struct pipe_draw_info){
|
||||
|
|
@ -5271,6 +5256,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
|||
out = (void *)agx_vdm_draw((uint32_t *)out, 0 /* ignored for now */, draw,
|
||||
agx_primitive_for_pipe(info->mode));
|
||||
|
||||
/* Barrier transform feedback writes on themselves for consistency.
|
||||
* This is the other half of agx_legalize_xfb.
|
||||
*/
|
||||
if (ctx->gs && ctx->streamout.num_targets > 0) {
|
||||
struct agx_device *dev = agx_device(ctx->base.screen);
|
||||
out = (void *)agx_vdm_barrier((uint32_t *)out, dev->chip);
|
||||
}
|
||||
|
||||
batch->vdm.current = out;
|
||||
assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end &&
|
||||
"Failed to reserve sufficient space in encoder");
|
||||
|
|
|
|||
|
|
@ -514,16 +514,8 @@ struct asahi_fs_shader_key {
|
|||
};
|
||||
static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes");
|
||||
|
||||
struct asahi_gs_shader_key {
|
||||
/* If true, this GS is run only for its side effects (including XFB) */
|
||||
bool rasterizer_discard;
|
||||
bool padding[7];
|
||||
};
|
||||
static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes");
|
||||
|
||||
union asahi_shader_key {
|
||||
struct asahi_vs_shader_key vs;
|
||||
struct asahi_gs_shader_key gs;
|
||||
struct asahi_fs_shader_key fs;
|
||||
};
|
||||
|
||||
|
|
|
|||