diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 1df5c537e3f..4471ce0076c 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -97,6 +97,12 @@ typedef struct
    nir_variable *clip_vertex_var;
    nir_variable *clipdist_neg_mask_var;
    bool has_clipdist;
+
+   /* outputs */
+   nir_ssa_def *outputs[VARYING_SLOT_MAX][4];
+   nir_ssa_def *outputs_16bit_lo[16][4];
+   nir_ssa_def *outputs_16bit_hi[16][4];
+   shader_output_types output_types;
 } lower_ngg_nogs_state;
 
 typedef struct
@@ -599,6 +605,9 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *st)
    nir_store_output(b, prim_id, nir_imm_zero(b, 1, 32),
                    .base = st->options->primitive_id_location,
                    .src_type = nir_type_uint32, .io_semantics = io_sem);
+
+   /* Update outputs_written to reflect that the pass added a new output. */
+   b->shader->info.outputs_written |= VARYING_BIT_PRIMITIVE_ID;
 }
 
 static void
@@ -1614,66 +1623,111 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
       unreachable("Should be VS or TES.");
 }
 
-static bool
-do_ngg_nogs_store_output_to_lds(nir_builder *b, nir_instr *instr, void *state)
+static void
+ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
 {
-   lower_ngg_nogs_state *st = (lower_ngg_nogs_state *)state;
+   if (!s->outputs[VARYING_SLOT_EDGE][0])
+      return;
 
-   if (instr->type != nir_instr_type_intrinsic)
-      return false;
-
-   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-   if (intrin->intrinsic != nir_intrinsic_store_output)
-      return false;
-
-   /* no indirect output */
-   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
-
-   b->cursor = nir_before_instr(instr);
-
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
-   unsigned component = nir_intrinsic_component(intrin);
-   unsigned write_mask = nir_intrinsic_write_mask(intrin);
-   nir_ssa_def *store_val = intrin->src[0].ssa;
-
-   if (sem.location == VARYING_SLOT_EDGE) {
-      if (st->has_user_edgeflags) {
-         /* clamp user edge flag to 1 for latter bit operations */
-         store_val = nir_umin(b, store_val, nir_imm_int(b, 1));
-         /* remove instr after cursor point to the new node */
-         nir_instr_remove(instr);
-      } else {
-         /* remove the edge flag output anyway as it should not be passed to next stage */
-         nir_instr_remove(instr);
-         return true;
-      }
-   } else {
-      write_mask = nir_instr_xfb_write_mask(intrin) >> component;
-      if (!(write_mask && st->streamout_enabled))
-         return false;
-   }
+   /* clamp user edge flag to 1 for latter bit operations */
+   nir_ssa_def *edgeflag = s->outputs[VARYING_SLOT_EDGE][0];
+   edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1));
 
    /* user edge flag is stored at the beginning of a vertex if streamout is not enabled */
    unsigned offset = 0;
-   if (st->streamout_enabled) {
+   if (s->streamout_enabled) {
       unsigned packed_location =
-         util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(sem.location));
-      offset = packed_location * 16 + component * 4;
+         util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE));
+      offset = packed_location * 16;
    }
 
    nir_ssa_def *tid = nir_load_local_invocation_index(b);
-   nir_ssa_def *addr = pervertex_lds_addr(b, tid, st->pervertex_lds_bytes);
+   nir_ssa_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
 
-   nir_store_shared(b, store_val, addr, .base = offset, .write_mask = write_mask);
-
-   return true;
+   nir_store_shared(b, edgeflag, addr, .base = offset);
 }
 
 static void
-ngg_nogs_store_all_outputs_to_lds(nir_shader *shader, lower_ngg_nogs_state *st)
+ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
 {
-   nir_shader_instructions_pass(shader, do_ngg_nogs_store_output_to_lds,
-                                nir_metadata_block_index | nir_metadata_dominance, st);
+   nir_xfb_info *info = b->shader->xfb_info;
+
+   uint64_t xfb_outputs = 0;
+   unsigned xfb_outputs_16bit = 0;
+   uint8_t xfb_mask[VARYING_SLOT_MAX] = {0};
+   uint8_t xfb_mask_16bit_lo[16] = {0};
+   uint8_t xfb_mask_16bit_hi[16] = {0};
+
+   /* Get XFB output mask for each slot. */
+   for (int i = 0; i < info->output_count; i++) {
+      nir_xfb_output_info *out = info->outputs + i;
+
+      if (out->location < VARYING_SLOT_VAR0_16BIT) {
+         xfb_outputs |= BITFIELD64_BIT(out->location);
+         xfb_mask[out->location] |= out->component_mask;
+      } else {
+         unsigned index = out->location - VARYING_SLOT_VAR0_16BIT;
+         xfb_outputs_16bit |= BITFIELD_BIT(index);
+
+         if (out->high_16bits)
+            xfb_mask_16bit_hi[index] |= out->component_mask;
+         else
+            xfb_mask_16bit_lo[index] |= out->component_mask;
+      }
+   }
+
+   nir_ssa_def *tid = nir_load_local_invocation_index(b);
+   nir_ssa_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
+
+   u_foreach_bit64(slot, xfb_outputs) {
+      unsigned packed_location =
+         util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(slot));
+
+      unsigned mask = xfb_mask[slot];
+      while (mask) {
+         int start, count;
+         u_bit_scan_consecutive_range(&mask, &start, &count);
+         /* Outputs here are sure to be 32bit.
+          *
+          * 64bit outputs have been lowered to two 32bit. As 16bit outputs:
+          * Vulkan does not allow streamout outputs less than 32bit.
+          * OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT.
+          */
+         nir_ssa_def *store_val = nir_vec(b, &s->outputs[slot][start], (unsigned)count);
+         nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
+      }
+   }
+
+   unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
+   u_foreach_bit64(slot, xfb_outputs_16bit) {
+      unsigned packed_location = num_32bit_outputs +
+         util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
+
+      unsigned mask_lo = xfb_mask_16bit_lo[slot];
+      unsigned mask_hi = xfb_mask_16bit_hi[slot];
+
+      nir_ssa_def **outputs_lo = s->outputs_16bit_lo[slot];
+      nir_ssa_def **outputs_hi = s->outputs_16bit_hi[slot];
+      nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
+
+      unsigned mask = mask_lo | mask_hi;
+      while (mask) {
+         int start, count;
+         u_bit_scan_consecutive_range(&mask, &start, &count);
+
+         nir_ssa_def *values[4] = {0};
+         for (int c = start; c < start + count; ++c) {
+            nir_ssa_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
+            nir_ssa_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
+
+            /* extend 8/16 bit to 32 bit, 64 bit has been lowered */
+            values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
+         }
+
+         nir_ssa_def *store_val = nir_vec(b, values, (unsigned)count);
+         nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
+      }
+   }
 }
 
 static void
@@ -1937,24 +1991,17 @@ ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
    return pervertex_lds_bytes;
 }
 
-static unsigned
-gather_vs_outputs(nir_builder *b, struct exec_list *cf_list, vs_output *outputs,
-                  const uint8_t *vs_output_param_offset)
+static void
+ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s)
 {
-   uint64_t output_mask32 = 0;
-   nir_ssa_def *outputs32[VARYING_SLOT_MAX][4] = {0};
-
-   unsigned output_mask16_lo = 0;
-   unsigned output_mask16_hi = 0;
-   nir_ssa_def *outputs16_lo[16][4];
-   nir_ssa_def *outputs16_hi[16][4];
-
    /* Assume:
    * - the shader used nir_lower_io_to_temporaries
   * - 64-bit outputs are lowered
    * - no indirect indexing is present
    */
-   struct nir_cf_node *first_node = exec_node_data(nir_cf_node, exec_list_get_head(cf_list), node);
+   struct nir_cf_node *first_node =
+      exec_node_data(nir_cf_node, exec_list_get_head(cf_list), node);
+
    for (nir_block *block = nir_cf_node_cf_tree_first(first_node); block != NULL;
        block = nir_block_cf_tree_next(block)) {
       nir_foreach_instr_safe (instr, block) {
@@ -1967,61 +2014,92 @@ gather_vs_outputs(nir_builder *b, struct exec_list *cf_list, vs_output *outputs,
 
         assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
 
-         unsigned slot = nir_intrinsic_io_semantics(intrin).location;
-         if (vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
-            continue;
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
+         unsigned slot = sem.location;
 
-         bool is_hi = nir_intrinsic_io_semantics(intrin).high_16bits;
-         bool is_16bit = slot >= VARYING_SLOT_VAR0_16BIT;
-
-         u_foreach_bit (i, nir_intrinsic_write_mask(intrin)) {
-            unsigned comp = nir_intrinsic_component(intrin) + i;
-            nir_ssa_def *chan = nir_channel(b, intrin->src[0].ssa, i);
-            if (is_16bit && is_hi)
-               outputs16_hi[slot - VARYING_SLOT_VAR0_16BIT][comp] = chan;
-            else if (is_16bit)
-               outputs16_lo[slot - VARYING_SLOT_VAR0_16BIT][comp] = chan;
-            else
-               outputs32[slot][comp] = chan;
+         nir_ssa_def **output;
+         nir_alu_type *type;
+         if (slot >= VARYING_SLOT_VAR0_16BIT) {
+            unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
+            if (sem.high_16bits) {
+               output = s->outputs_16bit_hi[index];
+               type = s->output_types.types_16bit_hi[index];
+            } else {
+               output = s->outputs_16bit_lo[index];
+               type = s->output_types.types_16bit_lo[index];
+            }
+         } else {
+            output = s->outputs[slot];
+            type = s->output_types.types[slot];
          }
 
-         if (is_16bit && is_hi)
-            output_mask16_hi |= BITFIELD_BIT(slot - VARYING_SLOT_VAR0_16BIT);
-         else if (is_16bit)
-            output_mask16_lo |= BITFIELD_BIT(slot - VARYING_SLOT_VAR0_16BIT);
-         else
-            output_mask32 |= BITFIELD64_BIT(slot);
+         unsigned component = nir_intrinsic_component(intrin);
+         unsigned write_mask = nir_intrinsic_write_mask(intrin);
+         nir_alu_type src_type = nir_intrinsic_src_type(intrin);
 
-         if (slot >= VARYING_SLOT_VAR0 || !(BITFIELD64_BIT(slot) & POS_EXPORT_MASK))
-            nir_instr_remove(&intrin->instr);
+         u_foreach_bit (i, write_mask) {
+            unsigned c = component + i;
+            output[c] = nir_channel(b, intrin->src[0].ssa, i);
+            type[c] = src_type;
+         }
+
+         /* remove the edge flag output anyway as it should not be passed to next stage */
+         bool is_edge_slot = slot == VARYING_SLOT_EDGE;
+         /* remove non-pos-export slot when GFX11, they are written to buffer memory */
+         bool is_pos_export_slot = slot < VARYING_SLOT_MAX && (BITFIELD64_BIT(slot) & POS_EXPORT_MASK);
+         if (is_edge_slot || (s->options->gfx_level >= GFX11 && !is_pos_export_slot))
+            nir_instr_remove(instr);
       }
    }
+}
 
+static unsigned
+gather_vs_outputs(nir_builder *b, vs_output *outputs, lower_ngg_nogs_state *s)
+{
    unsigned num_outputs = 0;
-   u_foreach_bit64 (i, output_mask32) {
-      outputs[num_outputs].slot = i;
-      for (unsigned j = 0; j < 4; j++) {
-         nir_ssa_def *chan = outputs32[i][j];
+   u_foreach_bit64 (slot, b->shader->info.outputs_written) {
+      if (s->options->vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
+         continue;
+
+      /* skip output if no one written before */
+      if (!s->outputs[slot][0] && !s->outputs[slot][1] &&
+          !s->outputs[slot][2] && !s->outputs[slot][3])
+         continue;
+
+      outputs[num_outputs].slot = slot;
+      for (int i = 0; i < 4; i++) {
+         nir_ssa_def *chan = s->outputs[slot][i];
         /* RADV implements 16-bit outputs as 32-bit with VARYING_SLOT_VAR0-31. */
-         outputs[num_outputs].chan[j] = chan && chan->bit_size == 16 ? nir_u2u32(b, chan) : chan;
+         outputs[num_outputs].chan[i] = chan && chan->bit_size == 16 ? nir_u2u32(b, chan) : chan;
       }
       num_outputs++;
   }
 
-   if (output_mask16_lo | output_mask16_hi) {
-      nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
-      u_foreach_bit (i, output_mask16_lo | output_mask16_hi) {
-         vs_output *output = &outputs[num_outputs++];
+   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
+      unsigned slot = VARYING_SLOT_VAR0_16BIT + i;
+      if (s->options->vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
+         continue;
 
-         output->slot = i + VARYING_SLOT_VAR0_16BIT;
-         for (unsigned j = 0; j < 4; j++) {
-            nir_ssa_def *lo = output_mask16_lo & BITFIELD_BIT(i) ? outputs16_lo[i][j] : NULL;
-            nir_ssa_def *hi = output_mask16_hi & BITFIELD_BIT(i) ? outputs16_hi[i][j] : NULL;
-            if (lo || hi)
-               output->chan[j] = nir_pack_32_2x16_split(b, lo ? lo : undef, hi ? hi : undef);
-            else
-               output->chan[j] = NULL;
-         }
+      /* skip output if no one written before */
+      if (!s->outputs_16bit_lo[i][0] && !s->outputs_16bit_lo[i][1] &&
+          !s->outputs_16bit_lo[i][2] && !s->outputs_16bit_lo[i][3] &&
+          !s->outputs_16bit_hi[i][0] && !s->outputs_16bit_hi[i][1] &&
+          !s->outputs_16bit_hi[i][2] && !s->outputs_16bit_hi[i][3])
+         continue;
+
+      vs_output *output = &outputs[num_outputs++];
+      output->slot = slot;
+
+      nir_ssa_def **output_lo = s->outputs_16bit_lo[i];
+      nir_ssa_def **output_hi = s->outputs_16bit_hi[i];
+      nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
+      for (int j = 0; j < 4; j++) {
+         nir_ssa_def *lo = output_lo[j] ? output_lo[j] : undef;
+         nir_ssa_def *hi = output_hi[j] ? output_hi[j] : undef;
+         if (output_lo[j] || output_hi[j])
+            output->chan[j] = nir_pack_32_2x16_split(b, lo, hi);
+         else
+            output->chan[j] = NULL;
       }
   }
 
@@ -2225,44 +2303,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
    }
    nir_pop_if(b, if_es_thread);
 
-   if (state.streamout_enabled) {
-      /* TODO: support culling after streamout. */
-      assert(!options->can_cull);
-
-      ngg_nogs_build_streamout(b, &state);
-   }
-
-   if (state.streamout_enabled || has_user_edgeflags) {
-      ngg_nogs_store_all_outputs_to_lds(shader, &state);
-      b->cursor = nir_after_cf_list(&impl->body);
-   }
-
-   /* Take care of late primitive export */
-   if (!state.early_prim_export) {
-      emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
-   }
-
-   /* Export varyings for GFX11+ */
-   if (state.options->gfx_level >= GFX11) {
-      vs_output outputs[64];
-
-      b->cursor = nir_after_cf_list(&if_es_thread->then_list);
-      unsigned num_outputs =
-         gather_vs_outputs(b, &if_es_thread->then_list, outputs, options->vs_output_param_offset);
-
-      if (num_outputs) {
-         b->cursor = nir_after_cf_node(&if_es_thread->cf_node);
-         create_vertex_param_phis(b, num_outputs, outputs);
-
-         b->cursor = nir_after_cf_list(&impl->body);
-
-         if (!num_es_threads)
-            num_es_threads = nir_load_merged_wave_info_amd(b);
-         export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs,
-                                    options->vs_output_param_offset);
-      }
-   }
-
    if (options->can_cull) {
       /* Replace uniforms. */
       apply_reusable_variables(b, &state);
@@ -2279,7 +2319,50 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
       nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var);
       nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 };
       nir_store_output(b, pos_val, nir_imm_int(b, 0), .base = state.position_store_base,
-                       .component = 0, .io_semantics = io_sem);
+                       .component = 0, .io_semantics = io_sem, .src_type = nir_type_float32);
+   }
+
+   /* Gather outputs data and types */
+   b->cursor = nir_after_cf_list(&if_es_thread->then_list);
+   ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state);
+
+   if (state.has_user_edgeflags)
+      ngg_nogs_store_edgeflag_to_lds(b, &state);
+
+   if (state.streamout_enabled) {
+      /* TODO: support culling after streamout. */
+      assert(!options->can_cull);
+
+      ngg_nogs_store_xfb_outputs_to_lds(b, &state);
+
+      b->cursor = nir_after_cf_list(&impl->body);
+      ngg_nogs_build_streamout(b, &state);
+   }
+
+   /* Take care of late primitive export */
+   if (!state.early_prim_export) {
+      b->cursor = nir_after_cf_list(&impl->body);
+      emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
+   }
+
+   /* Export varyings for GFX11+ */
+   if (state.options->gfx_level >= GFX11) {
+      vs_output outputs[64];
+
+      b->cursor = nir_after_cf_list(&if_es_thread->then_list);
+      unsigned num_outputs = gather_vs_outputs(b, outputs, &state);
+
+      if (num_outputs) {
+         b->cursor = nir_after_cf_node(&if_es_thread->cf_node);
+         create_vertex_param_phis(b, num_outputs, outputs);
+
+         b->cursor = nir_after_cf_list(&impl->body);
+
+         if (!num_es_threads)
+            num_es_threads = nir_load_merged_wave_info_amd(b);
+         export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs,
+                                    options->vs_output_param_offset);
+      }
    }
 
    nir_metadata_preserve(impl, nir_metadata_none);
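
Note on the per-vertex LDS layout used by the new ngg_nogs_store_edgeflag_to_lds/ngg_nogs_store_xfb_outputs_to_lds helpers: every slot set in outputs_written occupies one vec4 (16 bytes) in the vertex's LDS area, and each 32-bit component adds 4 bytes, so an output's byte offset is popcount(written slots below it) * 16 + component * 4; 16-bit slots are appended after all 32-bit slots. The standalone C sketch below only illustrates that offset computation for 32-bit slots; it is not Mesa code, and __builtin_popcountll plus a plain bitmask stand in for util_bitcount64()/BITFIELD64_MASK().

#include <stdint.h>
#include <stdio.h>

/* Byte offset of (slot, component) inside one vertex's packed LDS area,
 * following the packed_location * 16 + component * 4 rule from the patch.
 */
static unsigned packed_xfb_lds_offset(uint64_t outputs_written,
                                      unsigned slot, unsigned component)
{
   /* Each written slot below 'slot' takes one vec4 (16 bytes). */
   uint64_t lower_written = outputs_written & ((UINT64_C(1) << slot) - 1);
   unsigned packed_location = (unsigned)__builtin_popcountll(lower_written);
   return packed_location * 16 + component * 4;
}

int main(void)
{
   /* Example: slots 0, 13 and 32 are written; query slot 32, component 2.
    * Two lower slots are written, so the offset is 2 * 16 + 2 * 4 = 40.
    */
   uint64_t outputs_written = (UINT64_C(1) << 0) | (UINT64_C(1) << 13) | (UINT64_C(1) << 32);
   printf("offset = %u bytes\n", packed_xfb_lds_offset(outputs_written, 32, 2));
   return 0;
}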