diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 3ceb705258b..b81b65f18c7 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -63,6 +63,7 @@ typedef struct
    bool use_edgeflags;
    bool has_prim_query;
    bool streamout_enabled;
+   bool has_user_edgeflags;
    unsigned wave_size;
    unsigned max_num_waves;
    unsigned num_vertices_per_primitives;
@@ -453,6 +454,34 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def
       if (!arg)
          arg = emit_ngg_nogs_prim_exp_arg(b, st);
 
+      /* pack user edge flag info into arg */
+      if (st->has_user_edgeflags) {
+         /* Workgroup barrier: wait for ES threads to store user edge flags to LDS */
+         nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
+                            .memory_scope = NIR_SCOPE_WORKGROUP,
+                            .memory_semantics = NIR_MEMORY_ACQ_REL,
+                            .memory_modes = nir_var_mem_shared);
+
+         unsigned edge_flag_bits = (1u << 9) | (1u << 19) | (1u << 29);
+         nir_ssa_def *mask = nir_imm_intN_t(b, ~edge_flag_bits, 32);
+
+         unsigned edge_flag_offset = 0;
+         if (st->streamout_enabled) {
+            unsigned packed_location =
+               util_bitcount64(b->shader->info.outputs_written &
+                               BITFIELD64_MASK(VARYING_SLOT_EDGE));
+            edge_flag_offset = packed_location * 16;
+         }
+
+         for (int i = 0; i < st->num_vertices_per_primitives; i++) {
+            nir_ssa_def *vtx_idx = nir_load_var(b, st->gs_vtx_indices_vars[i]);
+            nir_ssa_def *addr = pervertex_lds_addr(b, vtx_idx, st->pervertex_lds_bytes);
+            nir_ssa_def *edge = nir_load_shared(b, 1, 32, addr, .base = edge_flag_offset);
+            mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 9 + i * 10));
+         }
+         arg = nir_iand(b, arg, mask);
+      }
+
       if (st->has_prim_query) {
          nir_if *if_shader_query = nir_push_if(b, nir_load_prim_gen_query_enabled_amd(b));
          {
@@ -1521,23 +1550,42 @@ do_ngg_nogs_store_output_to_lds(nir_builder *b, nir_instr *instr, void *state)
    if (intrin->intrinsic != nir_intrinsic_store_output)
       return false;
 
-   unsigned component = nir_intrinsic_component(intrin);
-   unsigned write_mask = nir_instr_xfb_write_mask(intrin) >> component;
-   if (!write_mask)
-      return false;
-
    b->cursor = nir_before_instr(instr);
 
-   unsigned base_offset = nir_src_as_uint(intrin->src[1]);
-   unsigned location = nir_intrinsic_io_semantics(intrin).location + base_offset;
-   unsigned packed_location =
-      util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(location));
-   unsigned offset = packed_location * 16 + component * 4;
+   unsigned component = nir_intrinsic_component(intrin);
+   unsigned write_mask = nir_intrinsic_write_mask(intrin);
+   nir_ssa_def *store_val = intrin->src[0].ssa;
+
+   if (nir_intrinsic_io_semantics(intrin).location == VARYING_SLOT_EDGE) {
+      if (st->has_user_edgeflags) {
+         /* clamp user edge flag to 1 for later bit operations */
+         store_val = nir_umin(b, store_val, nir_imm_int(b, 1));
+         /* remove instr only after the cursor points to the new node */
+         nir_instr_remove(instr);
+      } else {
+         /* remove the edge flag output anyway as it should not be passed to next stage */
+         nir_instr_remove(instr);
+         return true;
+      }
+   } else {
+      write_mask = nir_instr_xfb_write_mask(intrin) >> component;
+      if (!(write_mask && st->streamout_enabled))
+         return false;
+   }
+
+   /* user edge flag is stored at the beginning of a vertex if streamout is not enabled */
+   unsigned offset = 0;
+   if (st->streamout_enabled) {
+      unsigned base_offset = nir_src_as_uint(intrin->src[1]);
+      unsigned location = nir_intrinsic_io_semantics(intrin).location + base_offset;
+      unsigned packed_location =
+         util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(location));
+      offset = packed_location * 16 + component * 4;
+   }
 
    nir_ssa_def *tid = nir_load_local_invocation_index(b);
    nir_ssa_def *addr = pervertex_lds_addr(b, tid, st->pervertex_lds_bytes);
 
-   nir_ssa_def *store_val = intrin->src[0].ssa;
    nir_store_shared(b, store_val, addr, .base = offset, .write_mask = write_mask);
 
    return true;
@@ -1821,11 +1869,16 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    nir_variable *gs_accepted_var = can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "gs_accepted") : NULL;
 
    bool streamout_enabled = shader->xfb_info && !disable_streamout;
+   bool has_user_edgeflags = use_edgeflags && (shader->info.outputs_written & VARYING_BIT_EDGE);
    /* streamout need to be done before either prim or vertex export. Because when no
     * param export, rasterization can start right after prim and vertex export,
     * which left streamout buffer writes un-finished.
+    *
+    * Always use late prim export when user edge flags are enabled.
+    * This is because edge flags are written by ES threads but they
+    * are exported by GS threads as part of the primitive export.
     */
-   if (streamout_enabled)
+   if (streamout_enabled || has_user_edgeflags)
       early_prim_export = false;
 
    lower_ngg_nogs_state state = {
@@ -1846,6 +1899,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
       .instance_rate_inputs = instance_rate_inputs,
       .clipdist_enable_mask = clipdist_enable_mask,
       .user_clip_plane_enable_mask = user_clip_plane_enable_mask,
+      .has_user_edgeflags = has_user_edgeflags,
    };
 
    const bool need_prim_id_store_shared =
@@ -1902,7 +1956,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
          emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, state.prim_exp_arg_var));
 
       /* Wait for culling to finish using LDS. */
-      if (need_prim_id_store_shared) {
+      if (need_prim_id_store_shared || has_user_edgeflags) {
          nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
                                .memory_scope = NIR_SCOPE_WORKGROUP,
                                .memory_semantics = NIR_MEMORY_ACQ_REL,
@@ -1916,14 +1970,20 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
        * TODO: only alloc space for outputs that really need streamout.
        */
       state.pervertex_lds_bytes = (shader->num_outputs * 4 + 1) * 4;
-   } else if (need_prim_id_store_shared)
-      state.pervertex_lds_bytes = 4;
+   } else if (need_prim_id_store_shared || state.has_user_edgeflags) {
+      if (need_prim_id_store_shared)
+         state.pervertex_lds_bytes += 4;
+      if (state.has_user_edgeflags)
+         state.pervertex_lds_bytes += 4;
+
+      /* pad to odd dwords to avoid LDS bank conflict */
+      state.pervertex_lds_bytes |= 4;
 
-   if (need_prim_id_store_shared) {
-      /* We need LDS space when VS needs to export the primitive ID. */
       state.total_lds_bytes = MAX2(state.total_lds_bytes,
                                    state.pervertex_lds_bytes * max_num_es_vertices);
+   }
 
+   if (need_prim_id_store_shared) {
       emit_ngg_nogs_prim_id_store_shared(b, &state);
 
       /* Wait for GS threads to store primitive ID in LDS. */
@@ -1956,7 +2016,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    }
 
    /* streamout may be disabled by ngg_nogs_build_streamout() */
-   if (state.streamout_enabled) {
+   if (state.streamout_enabled || has_user_edgeflags) {
       ngg_nogs_store_all_outputs_to_lds(shader, &state);
       b->cursor = nir_after_cf_list(&impl->body);
    }