diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 1ce615b859b..6a32e64e159 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -93,6 +93,7 @@ ac_nir_lower_indirect_derefs(nir_shader *shader, typedef struct { + unsigned lds_bytes_if_culling_off; bool can_cull; bool passthrough; } ac_nir_ngg_config; diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 9cc7687a352..90dba07bb03 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -30,6 +30,8 @@ typedef struct { nir_variable *position_value_var; nir_variable *prim_exp_arg_var; + nir_variable *es_accepted_var; + nir_variable *gs_accepted_var; bool passthrough; bool export_prim_id; @@ -69,6 +71,46 @@ typedef struct gs_output_component_info output_component_info[VARYING_SLOT_MAX][4]; } lower_ngg_gs_state; +typedef struct { + nir_variable *pre_cull_position_value_var; +} remove_culling_shader_outputs_state; + +typedef struct { + nir_variable *pos_value_replacement; +} remove_extra_position_output_state; + +typedef struct { + nir_ssa_def *reduction_result; + nir_ssa_def *excl_scan_result; +} wg_scan_result; + +/* Per-vertex LDS layout of culling shaders */ +enum { + /* Position of the ES vertex (at the beginning for alignment reasons) */ + lds_es_pos_x = 0, + lds_es_pos_y = 4, + lds_es_pos_z = 8, + lds_es_pos_w = 12, + + /* 1 when the vertex is accepted, 0 if it should be culled */ + lds_es_vertex_accepted = 16, + /* ID of the thread which will export the current thread's vertex */ + lds_es_exporter_tid = 17, + + /* Repacked arguments - also listed separately for VS and TES */ + lds_es_arg_0 = 20, + + /* VS arguments which need to be repacked */ + lds_es_vs_vertex_id = 20, + lds_es_vs_instance_id = 24, + + /* TES arguments which need to be repacked */ + lds_es_tes_u = 20, + lds_es_tes_v = 24, + lds_es_tes_rel_patch_id = 28, + lds_es_tes_patch_id = 32, +}; + typedef struct { nir_ssa_def *num_repacked_invocations; nir_ssa_def *repacked_invocation_index; @@ -313,6 +355,467 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b) .write_mask = 1u, .src_type = nir_type_uint32, .io_semantics = io_sem); } +static bool +remove_culling_shader_output(nir_builder *b, nir_instr *instr, void *state) +{ + remove_culling_shader_outputs_state *s = (remove_culling_shader_outputs_state *) state; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + /* These are not allowed in VS / TES */ + assert(intrin->intrinsic != nir_intrinsic_store_per_vertex_output && + intrin->intrinsic != nir_intrinsic_load_per_vertex_input); + + /* We are only interested in output stores now */ + if (intrin->intrinsic != nir_intrinsic_store_output) + return false; + + b->cursor = nir_before_instr(instr); + + /* Position output - store the value to a variable, remove output store */ + nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin); + if (io_sem.location == VARYING_SLOT_POS) { + /* TODO: check if it's indirect, etc? 
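+       * For now this assumes the position is written by a direct (constant-offset)
+       * store_output; an indirectly indexed position store would need extra handling.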
*/ + unsigned writemask = nir_intrinsic_write_mask(intrin); + nir_ssa_def *store_val = intrin->src[0].ssa; + nir_store_var(b, s->pre_cull_position_value_var, store_val, writemask); + } + + /* Remove all output stores */ + nir_instr_remove(instr); + return true; +} + +static void +remove_culling_shader_outputs(nir_shader *culling_shader, lower_ngg_nogs_state *nogs_state, nir_variable *pre_cull_position_value_var) +{ + remove_culling_shader_outputs_state s = { + .pre_cull_position_value_var = pre_cull_position_value_var, + }; + + nir_shader_instructions_pass(culling_shader, remove_culling_shader_output, + nir_metadata_block_index | nir_metadata_dominance, &s); + + /* Remove dead code resulting from the deleted outputs. */ + bool progress; + do { + progress = false; + NIR_PASS(progress, culling_shader, nir_opt_dead_write_vars); + NIR_PASS(progress, culling_shader, nir_opt_dce); + NIR_PASS(progress, culling_shader, nir_opt_dead_cf); + } while (progress); +} + +static bool +remove_extra_pos_output(nir_builder *b, nir_instr *instr, void *state) +{ + remove_extra_position_output_state *s = (remove_extra_position_output_state *) state; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + /* These are not allowed in VS / TES */ + assert(intrin->intrinsic != nir_intrinsic_store_per_vertex_output && + intrin->intrinsic != nir_intrinsic_load_per_vertex_input); + + /* We are only interested in output stores now */ + if (intrin->intrinsic != nir_intrinsic_store_output) + return false; + + nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin); + if (io_sem.location != VARYING_SLOT_POS) + return false; + + b->cursor = nir_before_instr(instr); + + /* TODO: in case other outputs use what we calculated for pos, rewrite the usages of the store components here */ + + nir_instr_remove(instr); + return true; +} + +static void +remove_extra_pos_outputs(nir_shader *shader, lower_ngg_nogs_state *nogs_state) +{ + remove_extra_position_output_state s = { + .pos_value_replacement = nogs_state->position_value_var, + }; + + nir_shader_instructions_pass(shader, remove_extra_pos_output, + nir_metadata_block_index | nir_metadata_dominance, &s); +} + +/** + * Perform vertex compaction after culling. + * + * 1. Repack surviving ES invocations (this determines which lane will export which vertex) + * 2. Surviving ES vertex invocations store their data to LDS + * 3. Emit GS_ALLOC_REQ + * 4. Repacked invocations load the vertex data from LDS + * 5. GS threads update their vertex indices + */ +static void +compact_vertices_after_culling(nir_builder *b, + lower_ngg_nogs_state *nogs_state, + nir_variable *vertices_in_wave_var, + nir_variable *primitives_in_wave_var, + nir_variable **repacked_arg_vars, + nir_variable **gs_vtxaddr_vars, + nir_ssa_def *invocation_index, + nir_ssa_def *es_vertex_lds_addr, + unsigned ngg_scratch_lds_base_addr, + unsigned pervertex_lds_bytes, + unsigned max_exported_args) +{ + nir_variable *es_accepted_var = nogs_state->es_accepted_var; + nir_variable *gs_accepted_var = nogs_state->gs_accepted_var; + nir_variable *position_value_var = nogs_state->position_value_var; + nir_variable *prim_exp_arg_var = nogs_state->prim_exp_arg_var; + + nir_ssa_def *es_accepted = nir_load_var(b, es_accepted_var); + + /* Repack the vertices that survived the culling. 
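+    * This tells us how many vertices are left in the whole workgroup, and for
+    * each surviving ES invocation, the ID of the thread that will export its vertex.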
*/ + wg_repack_result rep = repack_invocations_in_workgroup(b, es_accepted, ngg_scratch_lds_base_addr, + nogs_state->max_num_waves, nogs_state->wave_size); + nir_ssa_def *num_live_vertices_in_workgroup = rep.num_repacked_invocations; + nir_ssa_def *es_exporter_tid = rep.repacked_invocation_index; + + nir_if *if_es_accepted = nir_push_if(b, es_accepted); + { + nir_ssa_def *exporter_addr = pervertex_lds_addr(b, es_exporter_tid, pervertex_lds_bytes); + + /* Store the exporter thread's index to the LDS space of the current thread so GS threads can load it */ + nir_build_store_shared(b, nir_u2u8(b, es_exporter_tid), es_vertex_lds_addr, .base = lds_es_exporter_tid, .align_mul = 1u, .write_mask = 0x1u); + + /* Store the current thread's position output to the exporter thread's LDS space */ + nir_ssa_def *pos = nir_load_var(b, position_value_var); + nir_build_store_shared(b, pos, exporter_addr, .base = lds_es_pos_x, .align_mul = 4u, .write_mask = 0xfu); + + /* Store the current thread's repackable arguments to the exporter thread's LDS space */ + for (unsigned i = 0; i < max_exported_args; ++i) { + nir_ssa_def *arg_val = nir_load_var(b, repacked_arg_vars[i]); + nir_build_store_shared(b, arg_val, exporter_addr, .base = lds_es_arg_0 + 4u * i, .align_mul = 4u, .write_mask = 0x1u); + } + } + nir_pop_if(b, if_es_accepted); + + /* If all vertices are culled, set primitive count to 0 as well. */ + nir_ssa_def *num_exported_prims = nir_build_load_workgroup_num_input_primitives_amd(b); + num_exported_prims = nir_bcsel(b, nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u), nir_imm_int(b, 0u), num_exported_prims); + + nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0))); + { + /* Tell the final vertex and primitive count to the HW. + * We do this here to mask some of the latency of the LDS. + */ + nir_build_alloc_vertices_and_primitives_amd(b, num_live_vertices_in_workgroup, num_exported_prims); + } + nir_pop_if(b, if_wave_0); + + /* Calculate the number of vertices and primitives left in the current wave */ + nir_ssa_def *has_vtx_after_culling = nir_ilt(b, invocation_index, num_live_vertices_in_workgroup); + nir_ssa_def *has_prm_after_culling = nir_ilt(b, invocation_index, num_exported_prims); + nir_ssa_def *vtx_cnt = nir_bit_count(b, nir_build_ballot(b, 1, nogs_state->wave_size, has_vtx_after_culling)); + nir_ssa_def *prm_cnt = nir_bit_count(b, nir_build_ballot(b, 1, nogs_state->wave_size, has_prm_after_culling)); + nir_store_var(b, vertices_in_wave_var, vtx_cnt, 0x1u); + nir_store_var(b, primitives_in_wave_var, prm_cnt, 0x1u); + + /* TODO: Consider adding a shortcut exit. + * Waves that have no vertices and primitives left can s_endpgm right here. 
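+    * (Not implemented yet. Care would be needed to ensure that such a wave's
+    * earlier LDS stores are visible to the remaining waves before it exits.)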
+    */
+
+   nir_scoped_barrier(b, .execution_scope=NIR_SCOPE_WORKGROUP, .memory_scope=NIR_SCOPE_WORKGROUP,
+                         .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
+
+   nir_if *if_packed_es_thread = nir_push_if(b, nir_ilt(b, invocation_index, num_live_vertices_in_workgroup));
+   {
+      /* Read position from the current ES thread's LDS space (written by the exported vertex's ES thread) */
+      nir_ssa_def *exported_pos = nir_build_load_shared(b, 4, 32, es_vertex_lds_addr, .base = lds_es_pos_x, .align_mul = 4u);
+      nir_store_var(b, position_value_var, exported_pos, 0xfu);
+
+      /* Read the repacked arguments */
+      for (unsigned i = 0; i < max_exported_args; ++i) {
+         nir_ssa_def *arg_val = nir_build_load_shared(b, 1, 32, es_vertex_lds_addr, .base = lds_es_arg_0 + 4u * i, .align_mul = 4u);
+         nir_store_var(b, repacked_arg_vars[i], arg_val, 0x1u);
+      }
+   }
+   nir_pop_if(b, if_packed_es_thread);
+
+   nir_if *if_gs_accepted = nir_push_if(b, nir_load_var(b, gs_accepted_var));
+   {
+      nir_ssa_def *exporter_vtx_indices[3] = {0};
+
+      /* Load the index of the ES threads that will export the current GS thread's vertices */
+      for (unsigned v = 0; v < 3; ++v) {
+         nir_ssa_def *vtx_addr = nir_load_var(b, gs_vtxaddr_vars[v]);
+         nir_ssa_def *exporter_vtx_idx = nir_build_load_shared(b, 1, 8, vtx_addr, .base = lds_es_exporter_tid, .align_mul = 1u);
+         exporter_vtx_indices[v] = nir_u2u32(b, exporter_vtx_idx);
+      }
+
+      nir_ssa_def *prim_exp_arg = emit_pack_ngg_prim_exp_arg(b, 3, exporter_vtx_indices, NULL);
+      nir_store_var(b, prim_exp_arg_var, prim_exp_arg, 0x1u);
+   }
+   nir_pop_if(b, if_gs_accepted);
+}
+
+static void
+add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_cf, lower_ngg_nogs_state *nogs_state)
+{
+   assert(b->shader->info.outputs_written & (1 << VARYING_SLOT_POS));
+
+   bool uses_instance_id = BITSET_TEST(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+   bool uses_tess_primitive_id = BITSET_TEST(b->shader->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
+
+   unsigned max_exported_args = b->shader->info.stage == MESA_SHADER_VERTEX ? 2 : 4;
+   if (b->shader->info.stage == MESA_SHADER_VERTEX && !uses_instance_id)
+      max_exported_args--;
+   else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL && !uses_tess_primitive_id)
+      max_exported_args--;
+
+   unsigned pervertex_lds_bytes = lds_es_arg_0 + max_exported_args * 4u;
+   unsigned total_es_lds_bytes = pervertex_lds_bytes * nogs_state->max_es_num_vertices;
+   unsigned max_num_waves = nogs_state->max_num_waves;
+   unsigned ngg_scratch_lds_base_addr = ALIGN(total_es_lds_bytes, 8u);
+   unsigned ngg_scratch_lds_bytes = ALIGN(max_num_waves, 4u);
+   nogs_state->total_lds_bytes = ngg_scratch_lds_base_addr + ngg_scratch_lds_bytes;
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(b->shader);
+
+   /* Create some helper variables. 
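+    * These are function temp variables that carry values across the culling
+    * control flow; nir_lower_vars_to_ssa turns them into SSA later.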
*/
+   nir_variable *position_value_var = nogs_state->position_value_var;
+   nir_variable *prim_exp_arg_var = nogs_state->prim_exp_arg_var;
+   nir_variable *gs_accepted_var = nogs_state->gs_accepted_var;
+   nir_variable *es_accepted_var = nogs_state->es_accepted_var;
+   nir_variable *vertices_in_wave_var = nir_local_variable_create(impl, glsl_uint_type(), "vertices_in_wave");
+   nir_variable *primitives_in_wave_var = nir_local_variable_create(impl, glsl_uint_type(), "primitives_in_wave");
+   nir_variable *gs_vtxaddr_vars[3] = {
+      nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx0_addr"),
+      nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx1_addr"),
+      nir_local_variable_create(impl, glsl_uint_type(), "gs_vtx2_addr"),
+   };
+   nir_variable *repacked_arg_vars[4] = {
+      nir_local_variable_create(impl, glsl_uint_type(), "repacked_arg_0"),
+      nir_local_variable_create(impl, glsl_uint_type(), "repacked_arg_1"),
+      nir_local_variable_create(impl, glsl_uint_type(), "repacked_arg_2"),
+      nir_local_variable_create(impl, glsl_uint_type(), "repacked_arg_3"),
+   };
+
+   /* Top part of the culling shader (aka. position shader part)
+    *
+    * We clone the full ES shader and emit it here, but we only really care
+    * about its position output, so we delete every other output from this part.
+    * The position output is stored into a temporary variable, and reloaded later.
+    */
+
+   b->cursor = nir_before_cf_list(&impl->body);
+
+   nir_if *if_es_thread = nir_push_if(b, nir_build_has_input_vertex_amd(b));
+   {
+      /* Initialize the position output variable, in case not all VS/TES invocations store the output.
+       * The spec doesn't require any particular value, but we use (0, 0, 0, 1) because some games rely on that.
+       */
+      nir_store_var(b, position_value_var, nir_imm_vec4(b, 0.0f, 0.0f, 0.0f, 1.0f), 0xfu);
+
+      /* Now reinsert a clone of the shader code */
+      struct hash_table *remap_table = _mesa_pointer_hash_table_create(NULL);
+      nir_cf_list_clone_and_reinsert(original_extracted_cf, &if_es_thread->cf_node, b->cursor, remap_table);
+      _mesa_hash_table_destroy(remap_table, NULL);
+      b->cursor = nir_after_cf_list(&if_es_thread->then_list);
+
+      /* Remember the current thread's shader arguments */
+      if (b->shader->info.stage == MESA_SHADER_VERTEX) {
+         nir_store_var(b, repacked_arg_vars[0], nir_build_load_vertex_id_zero_base(b), 0x1u);
+         if (uses_instance_id)
+            nir_store_var(b, repacked_arg_vars[1], nir_build_load_instance_id(b), 0x1u);
+      } else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) {
+         nir_ssa_def *tess_coord = nir_build_load_tess_coord(b);
+         nir_store_var(b, repacked_arg_vars[0], nir_channel(b, tess_coord, 0), 0x1u);
+         nir_store_var(b, repacked_arg_vars[1], nir_channel(b, tess_coord, 1), 0x1u);
+         nir_store_var(b, repacked_arg_vars[2], nir_build_load_tess_rel_patch_id_amd(b), 0x1u);
+         if (uses_tess_primitive_id)
+            nir_store_var(b, repacked_arg_vars[3], nir_build_load_primitive_id(b), 0x1u);
+      } else {
+         unreachable("Should be VS or TES.");
+      }
+   }
+   nir_pop_if(b, if_es_thread);
+
+   /* Remove all non-position outputs, and put the position output into the variable. */
+   nir_metadata_preserve(impl, nir_metadata_none);
+   remove_culling_shader_outputs(b->shader, nogs_state, position_value_var);
+   b->cursor = nir_after_cf_list(&impl->body);
+
+   /* Run culling algorithms if culling is enabled.
+    *
+    * NGG culling can be enabled or disabled at runtime.
+    * This is determined by an SGPR shader argument which is accessed
+    * by the following NIR intrinsic. 
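+    * When that argument says culling is disabled, the else branch below simply
+    * allocates space for all input vertices and primitives, as if culling never existed.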
+ */ + + nir_if *if_cull_en = nir_push_if(b, nir_build_load_cull_any_enabled_amd(b)); + { + nir_ssa_def *invocation_index = nir_build_load_local_invocation_index(b); + nir_ssa_def *es_vertex_lds_addr = pervertex_lds_addr(b, invocation_index, pervertex_lds_bytes); + + /* ES invocations store their vertex data to LDS for GS threads to read. */ + if_es_thread = nir_push_if(b, nir_build_has_input_vertex_amd(b)); + { + /* Store position components that are relevant to culling in LDS */ + nir_ssa_def *pre_cull_pos = nir_load_var(b, position_value_var); + nir_ssa_def *pre_cull_w = nir_channel(b, pre_cull_pos, 3); + nir_build_store_shared(b, pre_cull_w, es_vertex_lds_addr, .write_mask = 0x1u, .align_mul = 4, .base = lds_es_pos_w); + nir_ssa_def *pre_cull_x_div_w = nir_fdiv(b, nir_channel(b, pre_cull_pos, 0), pre_cull_w); + nir_ssa_def *pre_cull_y_div_w = nir_fdiv(b, nir_channel(b, pre_cull_pos, 1), pre_cull_w); + nir_build_store_shared(b, nir_vec2(b, pre_cull_x_div_w, pre_cull_y_div_w), es_vertex_lds_addr, .write_mask = 0x3u, .align_mul = 4, .base = lds_es_pos_x); + + /* Clear out the ES accepted flag in LDS */ + nir_build_store_shared(b, nir_imm_zero(b, 1, 8), es_vertex_lds_addr, .write_mask = 0x1u, .align_mul = 4, .base = lds_es_vertex_accepted); + } + nir_pop_if(b, if_es_thread); + + nir_scoped_barrier(b, .execution_scope=NIR_SCOPE_WORKGROUP, .memory_scope=NIR_SCOPE_WORKGROUP, + .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared); + + nir_store_var(b, gs_accepted_var, nir_imm_bool(b, false), 0x1u); + nir_store_var(b, prim_exp_arg_var, nir_imm_int(b, 1 << 31), 0x1u); + + /* GS invocations load the vertex data and perform the culling. */ + nir_if *if_gs_thread = nir_push_if(b, nir_build_has_input_primitive_amd(b)); + { + /* Load vertex indices from input VGPRs */ + nir_ssa_def *vtx_idx[3] = {0}; + for (unsigned vertex = 0; vertex < 3; ++vertex) + vtx_idx[vertex] = ngg_input_primitive_vertex_index(b, vertex); + + nir_ssa_def *vtx_addr[3] = {0}; + nir_ssa_def *pos[3][4] = {0}; + + /* Load W positions of vertices first because the culling code will use these first */ + for (unsigned vtx = 0; vtx < 3; ++vtx) { + vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes); + pos[vtx][3] = nir_build_load_shared(b, 1, 32, vtx_addr[vtx], .align_mul = 4u, .base = lds_es_pos_w); + nir_store_var(b, gs_vtxaddr_vars[vtx], vtx_addr[vtx], 0x1u); + } + + /* Load the X/W, Y/W positions of vertices */ + for (unsigned vtx = 0; vtx < 3; ++vtx) { + nir_ssa_def *xy = nir_build_load_shared(b, 2, 32, vtx_addr[vtx], .align_mul = 4u, .base = lds_es_pos_x); + pos[vtx][0] = nir_channel(b, xy, 0); + pos[vtx][1] = nir_channel(b, xy, 1); + } + + /* See if the current primitive is accepted */ + nir_ssa_def *accepted = ac_nir_cull_triangle(b, nir_imm_bool(b, true), pos); + nir_store_var(b, gs_accepted_var, accepted, 0x1u); + + nir_if *if_gs_accepted = nir_push_if(b, accepted); + { + /* Store the accepted state to LDS for ES threads */ + for (unsigned vtx = 0; vtx < 3; ++vtx) + nir_build_store_shared(b, nir_imm_intN_t(b, 0xff, 8), vtx_addr[vtx], .base = lds_es_vertex_accepted, .align_mul = 4u, .write_mask = 0x1u); + } + nir_pop_if(b, if_gs_accepted); + } + nir_pop_if(b, if_gs_thread); + + nir_scoped_barrier(b, .execution_scope=NIR_SCOPE_WORKGROUP, .memory_scope=NIR_SCOPE_WORKGROUP, + .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared); + + nir_store_var(b, es_accepted_var, nir_imm_bool(b, false), 0x1u); + + /* ES invocations load their accepted flag from LDS. 
*/
+      if_es_thread = nir_push_if(b, nir_build_has_input_vertex_amd(b));
+      {
+         nir_ssa_def *accepted = nir_build_load_shared(b, 1, 8u, es_vertex_lds_addr, .base = lds_es_vertex_accepted, .align_mul = 4u);
+         nir_ssa_def *accepted_bool = nir_ine(b, accepted, nir_imm_intN_t(b, 0, 8));
+         nir_store_var(b, es_accepted_var, accepted_bool, 0x1u);
+      }
+      nir_pop_if(b, if_es_thread);
+
+      /* Vertex compaction. */
+      compact_vertices_after_culling(b, nogs_state,
+                                     vertices_in_wave_var, primitives_in_wave_var,
+                                     repacked_arg_vars, gs_vtxaddr_vars,
+                                     invocation_index, es_vertex_lds_addr,
+                                     ngg_scratch_lds_base_addr, pervertex_lds_bytes, max_exported_args);
+   }
+   nir_push_else(b, if_cull_en);
+   {
+      /* When culling is disabled, we do the same as we would without culling. */
+      nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0)));
+      {
+         nir_ssa_def *vtx_cnt = nir_build_load_workgroup_num_input_vertices_amd(b);
+         nir_ssa_def *prim_cnt = nir_build_load_workgroup_num_input_primitives_amd(b);
+         nir_build_alloc_vertices_and_primitives_amd(b, vtx_cnt, prim_cnt);
+      }
+      nir_pop_if(b, if_wave_0);
+      nir_store_var(b, prim_exp_arg_var, emit_ngg_nogs_prim_exp_arg(b, nogs_state), 0x1u);
+
+      nir_ssa_def *vtx_cnt = nir_bit_count(b, nir_build_ballot(b, 1, nogs_state->wave_size, nir_build_has_input_vertex_amd(b)));
+      nir_ssa_def *prm_cnt = nir_bit_count(b, nir_build_ballot(b, 1, nogs_state->wave_size, nir_build_has_input_primitive_amd(b)));
+      nir_store_var(b, vertices_in_wave_var, vtx_cnt, 0x1u);
+      nir_store_var(b, primitives_in_wave_var, prm_cnt, 0x1u);
+   }
+   nir_pop_if(b, if_cull_en);
+
+   /* Update shader arguments.
+    *
+    * The registers which hold information about the subgroup's
+    * vertices and primitives are updated here, so the rest of the shader
+    * doesn't need to worry about the culling.
+    *
+    * These "overwrite" intrinsics must be in top-level control flow,
+    * otherwise they can mess up the backend (eg. ACO's SSA).
+    *
+    * TODO:
+    * A cleaner solution would be to simply replace all usages of these args
+    * with the load of the variables.
+    * However, this wouldn't work right now because the backend uses the arguments
+    * for purposes not expressed in NIR, eg. VS input loads, etc.
+    * This can change if VS input loads and other stuff are lowered to eg. load_buffer_amd.
+    */
+
+   if (b->shader->info.stage == MESA_SHADER_VERTEX)
+      nir_build_overwrite_vs_arguments_amd(b,
+         nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1]));
+   else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL)
+      nir_build_overwrite_tes_arguments_amd(b,
+         nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1]),
+         nir_load_var(b, repacked_arg_vars[2]), nir_load_var(b, repacked_arg_vars[3]));
+   else
+      unreachable("Should be VS or TES.");
+
+   nir_ssa_def *vertices_in_wave = nir_load_var(b, vertices_in_wave_var);
+   nir_ssa_def *primitives_in_wave = nir_load_var(b, primitives_in_wave_var);
+   nir_build_overwrite_subgroup_num_vertices_and_primitives_amd(b, vertices_in_wave, primitives_in_wave);
+}
+
+static bool
+can_use_deferred_attribute_culling(nir_shader *shader)
+{
+   /* When the shader writes memory, it is difficult to guarantee correctness.
+    * Future work:
+    * - if only write-only SSBOs are used
+    * - if we can prove that non-position outputs don't rely on memory stores
+    * then it may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd. 
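+    * As it is now, the memory stores would be executed in both shader parts,
+    * so their side effects would happen twice.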
+ */ + if (shader->info.writes_memory) + return false; + + /* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling. + * Future work: try to save this to LDS and reload, but it can still be broken in subtle ways. + */ + if (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION)) + return false; + + return true; +} + ac_nir_ngg_config ac_nir_lower_ngg_nogs(nir_shader *shader, unsigned max_num_es_vertices, @@ -328,12 +831,15 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, assert(impl); assert(max_num_es_vertices && max_workgroup_size && wave_size); - bool can_cull = false; /* TODO */ + bool can_cull = consider_culling && (num_vertices_per_primitives == 3) && + can_use_deferred_attribute_culling(shader); bool passthrough = consider_passthrough && !can_cull && !(shader->info.stage == MESA_SHADER_VERTEX && export_prim_id); nir_variable *position_value_var = nir_local_variable_create(impl, glsl_vec4_type(), "position_value"); nir_variable *prim_exp_arg_var = nir_local_variable_create(impl, glsl_uint_type(), "prim_exp_arg"); + nir_variable *es_accepted_var = can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "es_accepted") : NULL; + nir_variable *gs_accepted_var = can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "gs_accepted") : NULL; lower_ngg_nogs_state state = { .passthrough = passthrough, @@ -343,6 +849,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, .provoking_vtx_idx = provoking_vtx_last ? (num_vertices_per_primitives - 1) : 0, .position_value_var = position_value_var, .prim_exp_arg_var = prim_exp_arg_var, + .es_accepted_var = es_accepted_var, + .gs_accepted_var = gs_accepted_var, .max_num_waves = DIV_ROUND_UP(max_workgroup_size, wave_size), .max_es_num_vertices = max_num_es_vertices, .wave_size = wave_size, @@ -352,12 +860,15 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, if (shader->info.stage == MESA_SHADER_VERTEX && export_prim_id) state.total_lds_bytes = max_num_es_vertices * 4u; - nir_cf_list extracted; - nir_cf_extract(&extracted, nir_before_cf_list(&impl->body), nir_after_cf_list(&impl->body)); + /* The shader only needs this much LDS when culling is turned off. 
*/
+   unsigned lds_bytes_if_culling_off = state.total_lds_bytes;
 
    nir_builder builder;
    nir_builder *b = &builder; /* This is to avoid the & */
    nir_builder_init(b, impl);
+
+   nir_cf_list extracted;
+   nir_cf_extract(&extracted, nir_before_cf_list(&impl->body), nir_after_cf_list(&impl->body));
    b->cursor = nir_before_cf_list(&impl->body);
 
    if (!can_cull) {
@@ -376,23 +887,23 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
       else
          nir_store_var(b, prim_exp_arg_var, emit_ngg_nogs_prim_exp_arg(b, &state), 0x1u);
    } else {
-      abort(); /* TODO */
+      add_deferred_attribute_culling(b, &extracted, &state);
+      b->cursor = nir_after_cf_list(&impl->body);
+
+      if (state.early_prim_export)
+         emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, state.prim_exp_arg_var));
    }
 
+   nir_intrinsic_instr *export_vertex_instr;
+
    nir_if *if_es_thread = nir_push_if(b, nir_build_has_input_vertex_amd(b));
    {
-      if (can_cull) {
-         nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var);
-         nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 };
-         nir_build_store_output(b, pos_val, nir_imm_int(b, 0), .base = VARYING_SLOT_POS, .component = 0, .io_semantics = io_sem, .write_mask = 0xfu);
-      }
-
       /* Run the actual shader */
       nir_cf_reinsert(&extracted, b->cursor);
       b->cursor = nir_after_cf_list(&if_es_thread->then_list);
 
       /* Export all vertex attributes (except primitive ID) */
-      nir_build_export_vertex_amd(b);
+      export_vertex_instr = nir_build_export_vertex_amd(b);
 
       /* Export primitive ID (in case of early primitive export or TES) */
       if (state.export_prim_id && (state.early_prim_export || shader->info.stage != MESA_SHADER_VERTEX))
@@ -410,17 +921,50 @@
       }
    }
 
+   if (can_cull) {
+      /* Remove the redundant position output. */
+      remove_extra_pos_outputs(shader, &state);
+
+      /* After looking at the performance of apps such as Doom Eternal and The Witcher 3,
+       * it seems best to always put the position export at the end, and
+       * then let ACO schedule it up (slightly) only when early prim export is used.
+       */
+      b->cursor = nir_before_instr(&export_vertex_instr->instr);
+
+      nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var);
+      nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 };
+      nir_build_store_output(b, pos_val, nir_imm_int(b, 0), .base = VARYING_SLOT_POS, .component = 0, .io_semantics = io_sem, .write_mask = 0xfu);
+   }
+
    nir_metadata_preserve(impl, nir_metadata_none);
    nir_validate_shader(shader, "after emitting NGG VS/TES");
 
    /* Cleanup */
+   nir_opt_dead_write_vars(shader);
    nir_lower_vars_to_ssa(shader);
    nir_remove_dead_variables(shader, nir_var_function_temp, NULL);
-   nir_opt_undef(shader);
+   nir_lower_alu_to_scalar(shader, NULL, NULL);
+   nir_lower_phis_to_scalar(shader, true);
+
+   if (can_cull) {
+      /* It's beneficial to redo these opts after splitting the shader. */
+      nir_opt_sink(shader, nir_move_load_input | nir_move_const_undef | nir_move_copies);
+      nir_opt_move(shader, nir_move_load_input | nir_move_copies | nir_move_const_undef);
+   }
+
+   bool progress;
+   do {
+      progress = false;
+      NIR_PASS(progress, shader, nir_opt_undef);
+      NIR_PASS(progress, shader, nir_opt_cse);
+      NIR_PASS(progress, shader, nir_opt_dce);
+      NIR_PASS(progress, shader, nir_opt_dead_cf);
+   } while (progress);
 
    shader->info.shared_size = state.total_lds_bytes;
 
    ac_nir_ngg_config ret = {
+      .lds_bytes_if_culling_off = lds_bytes_if_culling_off,
       .can_cull = can_cull,
       .passthrough = passthrough,
    };