ac/nir: Use gs_accepted variable after culling.

This prevents us from recalculating the EXEC mask later in the shader,
and removes the requirement for counting the number of primitives.

The stats are better than expected because they also show that
some code that is still there is now DCE'd by ACO.

Fossil DB results on Sienna Cichlid (with NGGC on):

Totals from 58239 (45.27% of 128647) affected shaders:
SpillSGPRs: 330 -> 340 (+3.03%)
CodeSize: 166356072 -> 162805724 (-2.13%)
Instrs: 31920041 -> 31089256 (-2.60%)
Latency: 138815742 -> 138113669 (-0.51%); split: -0.54%, +0.03%
InvThroughput: 22459553 -> 22404840 (-0.24%); split: -0.26%, +0.02%
SClause: 753746 -> 753765 (+0.00%); split: -0.00%, +0.01%
Copies: 3226647 -> 3268973 (+1.31%); split: -0.45%, +1.76%
Branches: 1223441 -> 1223440 (-0.00%); split: -0.00%, +0.00%
PreSGPRs: 2025339 -> 2091013 (+3.24%)

No Fossil DB changes with NGGC off.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11908>
2026-04-25 00:30:37 +02:00 · 2021-07-15 14:32:34 +02:00 · 2021-07-15 14:32:34 +02:00 · a2d02c0c11
commit a2d02c0c11
parent 8159868699
1 changed files with 9 additions and 2 deletions
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -318,7 +318,11 @@ emit_ngg_nogs_prim_exp_arg(nir_builder *b, lower_ngg_nogs_state *st)
 static void
 emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def *arg)
 {
-   nir_if *if_gs_thread = nir_push_if(b, nir_build_has_input_primitive_amd(b));
+   nir_ssa_def *gs_thread = st->gs_accepted_var
+                            ? nir_load_var(b, st->gs_accepted_var)
+                            : nir_build_has_input_primitive_amd(b);
+
+   nir_if *if_gs_thread = nir_push_if(b, gs_thread);
   {
      if (!arg)
         arg = emit_ngg_nogs_prim_exp_arg(b, st);
@@ -584,7 +588,8 @@ compact_vertices_after_culling(nir_builder *b,

   /* If all vertices are culled, set primitive count to 0 as well. */
   nir_ssa_def *num_exported_prims = nir_build_load_workgroup_num_input_primitives_amd(b);
-   num_exported_prims = nir_bcsel(b, nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u), nir_imm_int(b, 0u), num_exported_prims);
+   nir_ssa_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
+   num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), num_exported_prims);

   nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0)));
   {
@@ -642,6 +647,7 @@ compact_vertices_after_culling(nir_builder *b,
   nir_pop_if(b, if_gs_accepted);

   nir_store_var(b, es_accepted_var, es_survived, 0x1u);
+   nir_store_var(b, gs_accepted_var, nir_bcsel(b, fully_culled, nir_imm_false(b), nir_build_has_input_primitive_amd(b)), 0x1u);
 }

 static void
@@ -958,6 +964,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
   nir_pop_if(b, if_es_thread);

   nir_store_var(b, es_accepted_var, es_thread, 0x1u);
+   nir_store_var(b, gs_accepted_var, nir_build_has_input_primitive_amd(b), 0x1u);

   /* Remove all non-position outputs, and put the position output into the variable. */
   nir_metadata_preserve(impl, nir_metadata_none);