From 54ae38042ab94fbb81731426db2040949464b1fd Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 14 Dec 2022 17:52:07 +0000 Subject: [PATCH] ac/nir: remove num_es_threads_var MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A bit count of es_accepted works for both when ngg is and isn't dynamically enabled. Unlike the other sequence, this should only be a single SALU instruction. fossil-db (gfx1100, nggc): Totals from 41388 (30.75% of 134574) affected shaders: Instrs: 25783544 -> 25432959 (-1.36%); split: -1.36%, +0.00% CodeSize: 127281160 -> 125878820 (-1.10%); split: -1.10%, +0.00% Latency: 92849566 -> 92723047 (-0.14%); split: -0.14%, +0.00% InvThroughput: 9542194 -> 9485012 (-0.60%); split: -0.60%, +0.00% Copies: 2031074 -> 1928796 (-5.04%); split: -5.04%, +0.00% Branches: 642407 -> 642409 (+0.00%) Signed-off-by: Rhys Perry Reviewed-by: Qiang Yu Reviewed-by: Timur Kristóf Part-of: --- src/amd/common/ac_nir_lower_ngg.c | 37 +++++++++++-------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 75519b84b61..9cb28181ee9 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -65,7 +65,6 @@ typedef struct nir_variable *prim_exp_arg_var; nir_variable *es_accepted_var; nir_variable *gs_accepted_var; - nir_variable *num_es_threads_var; nir_variable *gs_vtx_indices_vars[3]; nir_ssa_def *vtx_addr[3]; @@ -1543,18 +1542,6 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c nir_ssa_def *num_live_vertices_in_workgroup = rep.num_repacked_invocations; nir_ssa_def *es_exporter_tid = rep.repacked_invocation_index; - if (nogs_state->num_es_threads_var) { - nir_ssa_def *num_live_vertices_in_wave = num_live_vertices_in_workgroup; - if (nogs_state->max_num_waves > 1) { - num_live_vertices_in_wave = - nir_usub_sat(b, num_live_vertices_in_wave, - nir_imul_imm(b, nir_load_subgroup_id(b), nogs_state->options->wave_size)); - num_live_vertices_in_wave = nir_umin(b, num_live_vertices_in_wave, - nir_imm_int(b, nogs_state->options->wave_size)); - } - nir_store_var(b, nogs_state->num_es_threads_var, num_live_vertices_in_wave, 0x1); - } - /* If all vertices are culled, set primitive count to 0 as well. */ nir_ssa_def *num_exported_prims = nir_load_workgroup_num_input_primitives_amd(b); nir_ssa_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u); @@ -1585,9 +1572,6 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c } nir_pop_if(b, if_wave_0); nir_store_var(b, prim_exp_arg_var, emit_ngg_nogs_prim_exp_arg(b, nogs_state), 0x1u); - - if (nogs_state->num_es_threads_var) - nir_store_var(b, nogs_state->num_es_threads_var, nir_load_merged_wave_info_amd(b), 0x1); } nir_pop_if(b, if_cull_en); @@ -2097,10 +2081,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "es_accepted") : NULL; nir_variable *gs_accepted_var = options->can_cull ? nir_local_variable_create(impl, glsl_bool_type(), "gs_accepted") : NULL; - nir_variable *num_es_threads_var = - options->can_cull && options->gfx_level >= GFX11 - ? nir_local_variable_create(impl, glsl_uint_type(), "num_es_threads") - : NULL; bool streamout_enabled = shader->xfb_info && !options->disable_streamout; bool has_user_edgeflags = @@ -2124,7 +2104,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option .prim_exp_arg_var = prim_exp_arg_var, .es_accepted_var = es_accepted_var, .gs_accepted_var = gs_accepted_var, - .num_es_threads_var = num_es_threads_var, .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size), .has_user_edgeflags = has_user_edgeflags, }; @@ -2211,6 +2190,16 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option nir_ssa_def *es_thread = options->can_cull ? nir_load_var(b, es_accepted_var) : has_input_vertex(b); + /* Calculate the bit count here instead of below for lower SGPR usage and better ALU + * scheduling. + */ + nir_ssa_def *num_es_threads = NULL; + if (state.options->gfx_level >= GFX11 && options->can_cull) { + nir_ssa_def *es_accepted_mask = + nir_ballot(b, 1, options->wave_size, nir_load_var(b, es_accepted_var)); + num_es_threads = nir_bit_count(b, es_accepted_mask); + } + nir_if *if_es_thread = nir_push_if(b, es_thread); { /* Run the actual shader */ @@ -2256,9 +2245,9 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option b->cursor = nir_after_cf_list(&impl->body); - nir_ssa_def *num_threads = options->can_cull ? nir_load_var(b, num_es_threads_var) - : nir_load_merged_wave_info_amd(b); - export_vertex_params_gfx11(b, NULL, num_threads, num_outputs, outputs, + if (!num_es_threads) + num_es_threads = nir_load_merged_wave_info_amd(b); + export_vertex_params_gfx11(b, NULL, num_es_threads, num_outputs, outputs, options->vs_output_param_offset); } }