ac/nir/ngg: Implement optional primitive compaction.

Instead of exporting NULL primitives, perform a compaction
on primitives after culling. This is an experimental feature
that we may enable later.
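
As a rough illustration of the idea (a minimal CPU-side sketch, not part
of this change; all names here are made up), the compaction boils down to
an exclusive prefix sum over the per-thread acceptance mask, which gives
every surviving primitive a contiguous exporter slot:

#include <stdbool.h>
#include <stdio.h>

#define NUM_PRIMS 8u

int main(void)
{
   /* One entry per GS thread: did its primitive survive culling? */
   bool accepted[NUM_PRIMS] = {true, false, true, true, false, false, true, false};
   unsigned packed_slot[NUM_PRIMS];

   /* Exclusive prefix sum over the mask; on the GPU the workgroup repack
    * computes this for all invocations in parallel. */
   unsigned num_live = 0;
   for (unsigned i = 0; i < NUM_PRIMS; i++) {
      packed_slot[i] = num_live;
      num_live += accepted[i];
   }

   /* Surviving primitive i is handed to packed thread packed_slot[i];
    * only the first num_live threads export, so no NULL primitives. */
   for (unsigned i = 0; i < NUM_PRIMS; i++) {
      if (accepted[i])
         printf("prim %u -> exporter thread %u\n", i, packed_slot[i]);
   }
   printf("%u live primitives instead of %u exports\n", num_live, NUM_PRIMS);
   return 0;
}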

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32290>
commit 45c523104a
parent 492d8f3778
Author: Timur Kristóf
Date:   2024-11-23 12:11:45 +01:00

5 changed files with 74 additions and 15 deletions

View file

@@ -134,6 +134,7 @@ typedef struct {
    bool kill_pointsize;
    bool kill_layer;
    bool force_vrs;
+   bool compact_primitives;
 
    /* VS */
    unsigned num_vertices_per_primitive;

View file

@@ -983,6 +983,7 @@ cleanup_culling_shader_after_dce(nir_shader *shader,
  * 3. Emit GS_ALLOC_REQ
  * 4. Repacked invocations load the vertex data from LDS
  * 5. GS threads update their vertex indices
+ * 6. Optionally, do the same for primitives.
  */
 static void
 compact_vertices_after_culling(nir_builder *b,
@@ -993,6 +994,8 @@ compact_vertices_after_culling(nir_builder *b,
                                nir_def *es_vertex_lds_addr,
                                nir_def *es_exporter_tid,
                                nir_def *num_live_vertices_in_workgroup,
+                               nir_def *gs_exporter_tid,
+                               nir_def *num_live_primitives_in_workgroup,
                                unsigned pervertex_lds_bytes,
                                unsigned num_repacked_variables)
 {
@@ -1066,7 +1069,8 @@ compact_vertices_after_culling(nir_builder *b,
    }
    nir_pop_if(b, if_packed_es_thread);
 
-   nir_if *if_gs_accepted = nir_push_if(b, nir_load_var(b, gs_accepted_var));
+   nir_def *gs_accepted = nir_load_var(b, gs_accepted_var);
+   nir_if *if_gs_accepted = nir_push_if(b, gs_accepted);
    {
       nir_def *exporter_vtx_indices[3] = {0};
@@ -1086,6 +1090,46 @@ compact_vertices_after_culling(nir_builder *b,
    nir_pop_if(b, if_gs_accepted);
 
    nir_store_var(b, es_accepted_var, es_survived, 0x1u);
+
+   if (s->options->compact_primitives) {
+      /* For primitive compaction, re-use the same LDS space that we used for
+       * vertex compaction, so we need to wait until vertex threads are finished reading it.
+       * Considering we only need 1 DWORD per primitive, let's assume we always have enough space,
+       * since vertex compaction requires at least 5 DWORDs per vertex.
+       */
+      nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
+                     .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
+
+      if_gs_accepted = nir_push_if(b, gs_accepted);
+      {
+         nir_def *exporter_addr = pervertex_lds_addr(b, gs_exporter_tid, pervertex_lds_bytes);
+         nir_def *prim_exp_arg = nir_load_var(b, prim_exp_arg_var);
+
+         /* Store the primitive export argument into the address of the exporter thread. */
+         nir_store_shared(b, prim_exp_arg, exporter_addr, .base = lds_es_pos_x);
+      }
+      nir_pop_if(b, if_gs_accepted);
+
+      nir_barrier(b, .execution_scope=SCOPE_WORKGROUP, .memory_scope=SCOPE_WORKGROUP,
+                     .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared);
+
+      nir_def *gs_survived = nir_ilt(b, invocation_index, num_live_primitives_in_workgroup);
+      nir_if *if_packed_gs_thread = nir_push_if(b, gs_survived);
+      {
+         /* Load the primitive export argument that the current thread will export. */
+         nir_def *prim_exp_arg = nir_load_shared(b, 1, 32, es_vertex_lds_addr, .base = lds_es_pos_x);
+         nir_store_var(b, prim_exp_arg_var, prim_exp_arg, 0x1u);
+      }
+      nir_push_else(b, if_packed_gs_thread);
+      {
+         nir_store_var(b, prim_exp_arg_var, nir_undef(b, 1, 32), 0x1u);
+      }
+      nir_pop_if(b, if_packed_gs_thread);
+
+      nir_store_var(b, gs_accepted_var, gs_survived, 0x1u);
+      nir_store_var(b, s->gs_exported_var, gs_survived, 0x1u);
+   }
 }
 
 static void
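
The hunk above moves each surviving primitive's export argument with a
scatter/barrier/gather over the re-used LDS space. A minimal sequential
sketch of that pattern (illustrative only, not Mesa code; the two barriers
are implicit here because the loops run one thread at a time):

#include <stdbool.h>
#include <stdio.h>

#define NUM_THREADS 8u

int main(void)
{
   bool     gs_accepted[NUM_THREADS]  = {true, false, true, true, false, false, true, false};
   unsigned prim_exp_arg[NUM_THREADS] = {100, 101, 102, 103, 104, 105, 106, 107};
   unsigned lds[NUM_THREADS]; /* stands in for the re-used per-vertex LDS space */

   /* Phase 1 (between the two barriers): each surviving thread scatters its
    * primitive export argument into the LDS slot of its packed exporter
    * thread; exporter_tid plays the role of the repacked invocation index. */
   unsigned exporter_tid = 0;
   for (unsigned tid = 0; tid < NUM_THREADS; tid++) {
      if (gs_accepted[tid])
         lds[exporter_tid++] = prim_exp_arg[tid];
   }
   unsigned num_live_prims = exporter_tid;

   /* Phase 2 (after the second barrier): the first num_live_prims threads
    * gather the export argument from their own LDS slot and export it. */
   for (unsigned tid = 0; tid < num_live_prims; tid++)
      printf("thread %u exports prim_exp_arg %u\n", tid, lds[tid]);
   return 0;
}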
@@ -1649,20 +1693,28 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
    nir_pop_if(b, if_es_thread);
 
    nir_def *es_accepted = nir_load_var(b, s->es_accepted_var);
+   nir_def *gs_accepted = nir_load_var(b, s->gs_accepted_var);
 
-   /* Repack the vertices that survived the culling. */
-   nir_def *accepted[] = { es_accepted };
-   wg_repack_result rep[1] = {0};
-   repack_invocations_in_workgroup(b, accepted, rep, 1, lds_scratch_base,
+   /* Repack the vertices (always) and primitives (optional) that survived the culling. */
+   nir_def *accepted[] = { es_accepted, gs_accepted };
+   wg_repack_result rep[2] = {0};
+   const unsigned num_rep = s->options->compact_primitives ? 2 : 1;
+   repack_invocations_in_workgroup(b, accepted, rep, num_rep, lds_scratch_base,
                                    s->max_num_waves, s->options->wave_size);
 
    nir_def *num_live_vertices_in_workgroup = rep[0].num_repacked_invocations;
    nir_def *es_exporter_tid = rep[0].repacked_invocation_index;
+   nir_def *num_exported_prims = NULL;
+   nir_def *gs_exporter_tid = NULL;
 
-   /* If all vertices are culled, set primitive count to 0 as well. */
-   nir_def *num_exported_prims = nir_load_workgroup_num_input_primitives_amd(b);
-   nir_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
-   num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), num_exported_prims);
-   nir_store_var(b, s->gs_exported_var, nir_iand(b, nir_inot(b, fully_culled), has_input_primitive(b)), 0x1u);
+   if (s->options->compact_primitives) {
+      num_exported_prims = rep[1].num_repacked_invocations;
+      gs_exporter_tid = rep[1].repacked_invocation_index;
+   } else {
+      /* If all vertices are culled, set primitive count to 0 as well. */
+      nir_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u);
+      num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), nir_load_workgroup_num_input_primitives_amd(b));
+      nir_store_var(b, s->gs_exported_var, nir_iand(b, nir_inot(b, fully_culled), has_input_primitive(b)), 0x1u);
+   }
 
    nir_if *if_wave_0 = nir_push_if(b, nir_ieq_imm(b, nir_load_subgroup_id(b), 0));
    {
@@ -1682,6 +1734,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
                                        repacked_variables, gs_vtxaddr_vars,
                                        invocation_index, es_vertex_lds_addr,
                                        es_exporter_tid, num_live_vertices_in_workgroup,
+                                       gs_exporter_tid, num_exported_prims,
                                        pervertex_lds_bytes, num_repacked_variables);
    }
    nir_push_else(b, if_cull_en);
@@ -3725,7 +3778,8 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                             unsigned workgroup_size,
                             unsigned wave_size,
                             bool streamout_enabled,
-                            bool can_cull)
+                            bool can_cull,
+                            bool compact_primitives)
 {
    unsigned scratch_lds_size = 0;
    unsigned max_num_waves = DIV_ROUND_UP(workgroup_size, wave_size);
@@ -3735,7 +3789,9 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
          /* 4 dwords for 4 streamout buffer offset, 1 dword for emit prim count */
          scratch_lds_size = 20;
       } else if (can_cull) {
-         scratch_lds_size = ALIGN(max_num_waves, 4u);
+         /* 1 byte per wave per repack, max 8 waves */
+         unsigned num_rep = compact_primitives ? 2 : 1;
+         scratch_lds_size = ALIGN(max_num_waves, 4u) * num_rep;
       }
    } else {
       assert(stage == MESA_SHADER_GEOMETRY);
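
As a worked example of the sizing above (illustrative numbers; the
DIV_ROUND_UP/ALIGN macros below approximate Mesa's):

#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN(x, a)        (DIV_ROUND_UP(x, a) * (a))

int main(void)
{
   /* e.g. a 256-thread NGG workgroup in wave64 mode -> 4 waves */
   unsigned workgroup_size = 256, wave_size = 64;
   unsigned max_num_waves = DIV_ROUND_UP(workgroup_size, wave_size);

   for (unsigned num_rep = 1; num_rep <= 2; num_rep++)
      printf("num_rep=%u -> %u bytes of repack scratch\n",
             num_rep, ALIGN(max_num_waves, 4u) * num_rep);
   /* prints 4 bytes without primitive compaction, 8 bytes with it */
   return 0;
}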

View file

@@ -329,7 +329,8 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                             unsigned workgroup_size,
                             unsigned wave_size,
                             bool streamout_enabled,
-                            bool can_cull);
+                            bool can_cull,
+                            bool compact_primitives);
 
 enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr);

View file

@@ -1680,7 +1680,7 @@ gfx10_get_ngg_info(const struct radv_device *device, struct radv_shader_info *es
    /* Get scratch LDS usage. */
    const struct radv_shader_info *info = gs_info ? gs_info : es_info;
    const unsigned scratch_lds_size = ac_ngg_get_scratch_lds_size(info->stage, info->workgroup_size, info->wave_size,
-                                                                 pdev->use_ngg_streamout, info->has_ngg_culling);
+                                                                 pdev->use_ngg_streamout, info->has_ngg_culling, false);
    out->lds_size = out->scratch_lds_base + scratch_lds_size;
 
    unsigned workgroup_size =

View file

@@ -35,7 +35,8 @@ unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
                                        si_get_max_workgroup_size(shader),
                                        shader->wave_size,
                                        si_shader_uses_streamout(shader),
-                                       si_shader_culling_enabled(shader)) / 4;
+                                       si_shader_culling_enabled(shader),
+                                       false) / 4;
 }
 
 /**