ac/nir: Use gs_accepted variable after culling.

This prevents us from recalculating the EXEC mask later
in the shader, and removes the requirement for
counting the number of primitives.

The stats are better than expected because they also
show that some code that is still there is now DCE'd by ACO.

Fossil DB results on Sienna Cichlid (with NGGC on):

Totals from 58239 (45.27% of 128647) affected shaders:
SpillSGPRs: 330 -> 340 (+3.03%)
CodeSize: 166356072 -> 162805724 (-2.13%)
Instrs: 31920041 -> 31089256 (-2.60%)
Latency: 138815742 -> 138113669 (-0.51%); split: -0.54%, +0.03%
InvThroughput: 22459553 -> 22404840 (-0.24%); split: -0.26%, +0.02%
SClause: 753746 -> 753765 (+0.00%); split: -0.00%, +0.01%
Copies: 3226647 -> 3268973 (+1.31%); split: -0.45%, +1.76%
Branches: 1223441 -> 1223440 (-0.00%); split: -0.00%, +0.00%
PreSGPRs: 2025339 -> 2091013 (+3.24%)

No Fossil DB changes with NGGC off.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11908>
This commit is contained in:
Timur Kristóf 2021-07-15 14:32:34 +02:00 committed by Marge Bot
parent 8159868699
commit a2d02c0c11

View file

@ -318,7 +318,11 @@ emit_ngg_nogs_prim_exp_arg(nir_builder *b, lower_ngg_nogs_state *st)
static void
emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def *arg)
{
nir_if *if_gs_thread = nir_push_if(b, nir_build_has_input_primitive_amd(b));
nir_ssa_def *gs_thread = st->gs_accepted_var
? nir_load_var(b, st->gs_accepted_var)
: nir_build_has_input_primitive_amd(b);
nir_if *if_gs_thread = nir_push_if(b, gs_thread);
{
if (!arg)
arg = emit_ngg_nogs_prim_exp_arg(b, st);
@ -584,7 +588,8 @@ compact_vertices_after_culling(nir_builder *b,
/* If all vertices are culled, set primitive count to 0 as well. */
nir_ssa_def *num_exported_prims = nir_build_load_workgroup_num_input_primitives_amd(b);
num_exported_prims = nir_bcsel(b, nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u), nir_imm_int(b, 0u), num_exported_prims);
nir_ssa_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), num_exported_prims);
nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0)));
{
@ -642,6 +647,7 @@ compact_vertices_after_culling(nir_builder *b,
nir_pop_if(b, if_gs_accepted);
nir_store_var(b, es_accepted_var, es_survived, 0x1u);
nir_store_var(b, gs_accepted_var, nir_bcsel(b, fully_culled, nir_imm_false(b), nir_build_has_input_primitive_amd(b)), 0x1u);
}
static void
@ -958,6 +964,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
nir_pop_if(b, if_es_thread);
nir_store_var(b, es_accepted_var, es_thread, 0x1u);
nir_store_var(b, gs_accepted_var, nir_build_has_input_primitive_amd(b), 0x1u);
/* Remove all non-position outputs, and put the position output into the variable. */
nir_metadata_preserve(impl, nir_metadata_none);