diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index 57ca9292b86..e739c83c153 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -134,6 +134,7 @@ typedef struct {
    bool kill_pointsize;
    bool kill_layer;
    bool force_vrs;
+   bool compact_primitives;
 
    /* VS */
    unsigned num_vertices_per_primitive;
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index d4f97728d9e..d1c361165ff 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -983,6 +983,7 @@ cleanup_culling_shader_after_dce(nir_shader *shader,
  * 3. Emit GS_ALLOC_REQ
  * 4. Repacked invocations load the vertex data from LDS
  * 5. GS threads update their vertex indices
+ * 6. Optionally, do the same for primitives.
  */
 static void
 compact_vertices_after_culling(nir_builder *b,
@@ -993,6 +994,8 @@ compact_vertices_after_culling(nir_builder *b,
                                nir_def *es_vertex_lds_addr,
                                nir_def *es_exporter_tid,
                                nir_def *num_live_vertices_in_workgroup,
+                               nir_def *gs_exporter_tid,
+                               nir_def *num_live_primitives_in_workgroup,
                                unsigned pervertex_lds_bytes,
                                unsigned num_repacked_variables)
 {
@@ -1066,7 +1069,8 @@ compact_vertices_after_culling(nir_builder *b,
    }
    nir_pop_if(b, if_packed_es_thread);
 
-   nir_if *if_gs_accepted = nir_push_if(b, nir_load_var(b, gs_accepted_var));
+   nir_def *gs_accepted = nir_load_var(b, gs_accepted_var);
+   nir_if *if_gs_accepted = nir_push_if(b, gs_accepted);
    {
       nir_def *exporter_vtx_indices[3] = {0};
 
@@ -1086,6 +1090,46 @@ compact_vertices_after_culling(nir_builder *b,
    nir_pop_if(b, if_gs_accepted);
 
    nir_store_var(b, es_accepted_var, es_survived, 0x1u);
+
+   if (s->options->compact_primitives) {
+      /* For primitive compaction, re-use the same LDS space that we used for
+       * vertex compaction, so we need to wait until vertex threads are finished reading it.
+       * Considering we only need 1 DWORD per primitive, let's assume we always have enough space,
+       * since vertex compaction requires at least 5 DWORDs per vertex.
+       */
+      nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
+                     .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
+
+      if_gs_accepted = nir_push_if(b, gs_accepted);
+      {
+         nir_def *exporter_addr = pervertex_lds_addr(b, gs_exporter_tid, pervertex_lds_bytes);
+         nir_def *prim_exp_arg = nir_load_var(b, prim_exp_arg_var);
+
+         /* Store the primitive export argument into the address of the exporter thread. */
+         nir_store_shared(b, prim_exp_arg, exporter_addr, .base = lds_es_pos_x);
+      }
+      nir_pop_if(b, if_gs_accepted);
+
+      nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
+                     .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
+
+      nir_def *gs_survived = nir_ilt(b, invocation_index, num_live_primitives_in_workgroup);
+      nir_if *if_packed_gs_thread = nir_push_if(b, gs_survived);
+      {
+         /* Load the primitive export argument that the current thread will export. */
+         nir_def *prim_exp_arg = nir_load_shared(b, 1, 32, es_vertex_lds_addr, .base = lds_es_pos_x);
+
+         nir_store_var(b, prim_exp_arg_var, prim_exp_arg, 0x1u);
+      }
+      nir_push_else(b, if_packed_gs_thread);
+      {
+         nir_store_var(b, prim_exp_arg_var, nir_undef(b, 1, 32), 0x1u);
+      }
+      nir_pop_if(b, if_packed_gs_thread);
+
+      nir_store_var(b, gs_accepted_var, gs_survived, 0x1u);
+      nir_store_var(b, s->gs_exported_var, gs_survived, 0x1u);
+   }
 }
 
 static void
@@ -1649,20 +1693,28 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
       nir_pop_if(b, if_es_thread);
 
       nir_def *es_accepted = nir_load_var(b, s->es_accepted_var);
+      nir_def *gs_accepted = nir_load_var(b, s->gs_accepted_var);
 
-      /* Repack the vertices that survived the culling. */
-      nir_def *accepted[] = { es_accepted };
-      wg_repack_result rep[1] = {0};
-      repack_invocations_in_workgroup(b, accepted, rep, 1, lds_scratch_base,
+      /* Repack the vertices (always) and primitives (optional) that survived the culling. */
+      nir_def *accepted[] = { es_accepted, gs_accepted };
+      wg_repack_result rep[2] = {0};
+      const unsigned num_rep = s->options->compact_primitives ? 2 : 1;
+      repack_invocations_in_workgroup(b, accepted, rep, num_rep, lds_scratch_base,
                                       s->max_num_waves, s->options->wave_size);
       nir_def *num_live_vertices_in_workgroup = rep[0].num_repacked_invocations;
       nir_def *es_exporter_tid = rep[0].repacked_invocation_index;
+      nir_def *num_exported_prims = NULL;
+      nir_def *gs_exporter_tid = NULL;
 
-      /* If all vertices are culled, set primitive count to 0 as well. */
-      nir_def *num_exported_prims = nir_load_workgroup_num_input_primitives_amd(b);
-      nir_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
-      num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), num_exported_prims);
-      nir_store_var(b, s->gs_exported_var, nir_iand(b, nir_inot(b, fully_culled), has_input_primitive(b)), 0x1u);
+      if (s->options->compact_primitives) {
+         num_exported_prims = rep[1].num_repacked_invocations;
+         gs_exporter_tid = rep[1].repacked_invocation_index;
+      } else {
+         /* If all vertices are culled, set primitive count to 0 as well. */
+         nir_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
+         num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), nir_load_workgroup_num_input_primitives_amd(b));
+         nir_store_var(b, s->gs_exported_var, nir_iand(b, nir_inot(b, fully_culled), has_input_primitive(b)), 0x1u);
+      }
 
       nir_if *if_wave_0 = nir_push_if(b, nir_ieq_imm(b, nir_load_subgroup_id(b), 0));
       {
@@ -1682,6 +1734,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
                                      repacked_variables, gs_vtxaddr_vars,
                                      invocation_index, es_vertex_lds_addr,
                                      es_exporter_tid, num_live_vertices_in_workgroup,
+                                     gs_exporter_tid, num_exported_prims,
                                      pervertex_lds_bytes, num_repacked_variables);
    }
    nir_push_else(b, if_cull_en);
@@ -3725,7 +3778,8 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                             unsigned workgroup_size,
                             unsigned wave_size,
                             bool streamout_enabled,
-                            bool can_cull)
+                            bool can_cull,
+                            bool compact_primitives)
 {
    unsigned scratch_lds_size = 0;
    unsigned max_num_waves = DIV_ROUND_UP(workgroup_size, wave_size);
@@ -3735,7 +3789,9 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
          /* 4 dwords for 4 streamout buffer offset, 1 dword for emit prim count */
          scratch_lds_size = 20;
       } else if (can_cull) {
-         scratch_lds_size = ALIGN(max_num_waves, 4u);
+         /* 1 byte per wave per repack, max 8 waves */
+         unsigned num_rep = compact_primitives ? 2 : 1;
+         scratch_lds_size = ALIGN(max_num_waves, 4u) * num_rep;
       }
    } else {
       assert(stage == MESA_SHADER_GEOMETRY);
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h
index d2b024bedef..1a79f1f3acb 100644
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -329,7 +329,8 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                             unsigned workgroup_size,
                             unsigned wave_size,
                             bool streamout_enabled,
-                            bool can_cull);
+                            bool can_cull,
+                            bool compact_primitives);
 
 enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr);
 
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
index 0652efed287..bfb7eda4eb6 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -1680,7 +1680,7 @@ gfx10_get_ngg_info(const struct radv_device *device, struct radv_shader_info *es
    /* Get scratch LDS usage. */
    const struct radv_shader_info *info = gs_info ? gs_info : es_info;
    const unsigned scratch_lds_size = ac_ngg_get_scratch_lds_size(info->stage, info->workgroup_size, info->wave_size,
-                                                                 pdev->use_ngg_streamout, info->has_ngg_culling);
+                                                                 pdev->use_ngg_streamout, info->has_ngg_culling, false);
    out->lds_size = out->scratch_lds_base + scratch_lds_size;
 
    unsigned workgroup_size =
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 1bd47791fd4..19789406ec8 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -35,7 +35,8 @@ unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
                                       si_get_max_workgroup_size(shader),
                                       shader->wave_size,
                                       si_shader_uses_streamout(shader),
-                                      si_shader_culling_enabled(shader)) / 4;
+                                      si_shader_culling_enabled(shader),
+                                      false) / 4;
 }
 
 /**