ac/nir: remove num_es_threads_var

A bit count of es_accepted works for both when ngg is and isn't dynamically enabled. Unlike the other sequence, this should only be a single SALU instruction. fossil-db (gfx1100, nggc): Totals from 41388 (30.75% of 134574) affected shaders: Instrs: 25783544 -> 25432959 (-1.36%); split: -1.36%, +0.00% CodeSize: 127281160 -> 125878820 (-1.10%); split: -1.10%, +0.00% Latency: 92849566 -> 92723047 (-0.14%); split: -0.14%, +0.00% InvThroughput: 9542194 -> 9485012 (-0.60%); split: -0.60%, +0.00% Copies: 2031074 -> 1928796 (-5.04%); split: -5.04%, +0.00% Branches: 642407 -> 642409 (+0.00%) Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Qiang Yu <yuq825@gmail.com> Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20321>
2026-05-09 02:28:10 +02:00 · 2022-12-14 17:52:07 +00:00 · 2022-12-14 17:52:07 +00:00 · 54ae38042a
commit 54ae38042a
parent 69e55d9c1b
1 changed files with 13 additions and 24 deletions
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@ -65,7 +65,6 @@ typedef struct
   nir_variable *prim_exp_arg_var;
   nir_variable *es_accepted_var;
   nir_variable *gs_accepted_var;
-   nir_variable *num_es_threads_var;
   nir_variable *gs_vtx_indices_vars[3];

   nir_ssa_def *vtx_addr[3];
@ -1543,18 +1542,6 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
      nir_ssa_def *num_live_vertices_in_workgroup = rep.num_repacked_invocations;
      nir_ssa_def *es_exporter_tid = rep.repacked_invocation_index;

-      if (nogs_state->num_es_threads_var) {
-         nir_ssa_def *num_live_vertices_in_wave = num_live_vertices_in_workgroup;
-         if (nogs_state->max_num_waves > 1) {
-            num_live_vertices_in_wave =
-               nir_usub_sat(b, num_live_vertices_in_wave,
-                            nir_imul_imm(b, nir_load_subgroup_id(b), nogs_state->options->wave_size));
-            num_live_vertices_in_wave = nir_umin(b, num_live_vertices_in_wave,
-                                                 nir_imm_int(b, nogs_state->options->wave_size));
-         }
-         nir_store_var(b, nogs_state->num_es_threads_var, num_live_vertices_in_wave, 0x1);
-      }
-
      /* If all vertices are culled, set primitive count to 0 as well. */
      nir_ssa_def *num_exported_prims = nir_load_workgroup_num_input_primitives_amd(b);
      nir_ssa_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
@ -1585,9 +1572,6 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
      }
      nir_pop_if(b, if_wave_0);
      nir_store_var(b, prim_exp_arg_var, emit_ngg_nogs_prim_exp_arg(b, nogs_state), 0x1u);
-
-      if (nogs_state->num_es_threads_var)
-         nir_store_var(b, nogs_state->num_es_threads_var, nir_load_merged_wave_info_amd(b), 0x1);
   }
   nir_pop_if(b, if_cull_en);

@ -2097,10 +2081,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
      options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "es_accepted") : NULL;
   nir_variable *gs_accepted_var =
      options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "gs_accepted") : NULL;
-   nir_variable *num_es_threads_var =
-      options->can_cull && options->gfx_level >= GFX11
-         ? nir_local_variable_create(impl, glsl_uint_type(), "num_es_threads")
-         : NULL;

   bool streamout_enabled = shader->xfb_info && !options->disable_streamout;
   bool has_user_edgeflags =
@ -2124,7 +2104,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
      .prim_exp_arg_var = prim_exp_arg_var,
      .es_accepted_var = es_accepted_var,
      .gs_accepted_var = gs_accepted_var,
-      .num_es_threads_var = num_es_threads_var,
      .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
      .has_user_edgeflags = has_user_edgeflags,
   };
@ -2211,6 +2190,16 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
   nir_ssa_def *es_thread =
      options->can_cull ? nir_load_var(b, es_accepted_var) : has_input_vertex(b);

+   /* Calculate the bit count here instead of below for lower SGPR usage and better ALU
+    * scheduling.
+    */
+   nir_ssa_def *num_es_threads = NULL;
+   if (state.options->gfx_level >= GFX11 && options->can_cull) {
+      nir_ssa_def *es_accepted_mask =
+         nir_ballot(b, 1, options->wave_size, nir_load_var(b, es_accepted_var));
+      num_es_threads = nir_bit_count(b, es_accepted_mask);
+   }
+
   nir_if *if_es_thread = nir_push_if(b, es_thread);
   {
      /* Run the actual shader */
@ -2256,9 +2245,9 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option

         b->cursor = nir_after_cf_list(&impl->body);

-         nir_ssa_def *num_threads = options->can_cull ? nir_load_var(b, num_es_threads_var)
-                                                      : nir_load_merged_wave_info_amd(b);
-         export_vertex_params_gfx11(b, NULL, num_threads, num_outputs, outputs,
+         if (!num_es_threads)
+            num_es_threads = nir_load_merged_wave_info_amd(b);
+         export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs,
                                    options->vs_output_param_offset);
      }
   }