aco/ngg: Allocate NGG GS space early for const vertex/primitive counts.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6964>
2025-12-27 23:30:10 +01:00 · 2020-10-02 17:10:43 +02:00 · 2020-10-02 17:10:43 +02:00 · 61280bb4b6
commit 61280bb4b6
parent e8a0409d01
3 changed files with 59 additions and 14 deletions
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -10865,7 +10865,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Tem
   /* VS/TES: we infer the vertex and primitive count from arguments
    * GS: the caller needs to supply them
    */
-   assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY
+   assert((ctx->stage & sw_gs)
          ? (vtx_cnt.id() && prm_cnt.id())
          : (!vtx_cnt.id() && !prm_cnt.id()));

@@ -11330,9 +11330,18 @@ void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg,
   begin_divergent_if_then(ctx, &ic, is_vtx_export_thread);
   bld.reset(ctx->block);

-   /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
-   Operand m = load_lds_size_m0(bld);
-   Temp exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
+   /* The index of the vertex that the current thread will export. */
+   Temp exported_vtx_idx;
+
+   if (ctx->ngg_gs_early_alloc) {
+      /* No vertex compaction necessary, the thread can export its own vertex. */
+      exported_vtx_idx = tid_in_tg;
+   } else {
+      /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
+      Operand m = load_lds_size_m0(bld);
+      exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
+   }
+
   /* Get the LDS address of the vertex that the current thread must export. */
   Temp exported_vtx_addr = ngg_gs_vertex_lds_addr(ctx, exported_vtx_idx);

@@ -11367,6 +11376,19 @@ void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg,
   end_divergent_if(ctx, &ic);
 }

+void ngg_gs_prelude(isel_context *ctx)
+{
+   if (!ctx->ngg_gs_early_alloc)
+      return;
+
+   /* We know the GS writes the maximum possible number of vertices, so
+    * it's likely that most threads need to export a primitive, too.
+    * Thus, we won't have to worry about primitive compaction here.
+    */
+   Temp num_max_vertices = ngg_max_vertex_count(ctx);
+   ngg_emit_sendmsg_gs_alloc_req(ctx, num_max_vertices, num_max_vertices);
+}
+
 void ngg_gs_finale(isel_context *ctx)
 {
   if_context ic;
@@ -11391,19 +11413,33 @@ void ngg_gs_finale(isel_context *ctx)
    */
   Temp vertex_live = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(prim_flag_0));

-   /* Perform a workgroup reduction and exclusive scan. */
-   std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
-   bld.reset(ctx->block);
   /* Total number of vertices emitted by the workgroup. */
-   Temp wg_vtx_cnt = wg_scan.first;
+   Temp wg_vtx_cnt;
   /* ID of the thread which will export the current thread's vertex. */
-   Temp exporter_tid_in_tg = wg_scan.second;
-   /* Skip all exports when possible. */
-   Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
-   max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
+   Temp exporter_tid_in_tg;
+
+   if (ctx->ngg_gs_early_alloc) {
+      /* There is no need for a scan or vertex compaction, we know that
+       * the GS writes all possible vertices so each thread can export its own vertex.
+       */
+      wg_vtx_cnt = max_vtxcnt;
+      exporter_tid_in_tg = tid_in_tg;
+   } else {
+      /* Perform a workgroup reduction and exclusive scan. */
+      std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
+      bld.reset(ctx->block);
+      /* Total number of vertices emitted by the workgroup. */
+      wg_vtx_cnt = wg_scan.first;
+      /* ID of the thread which will export the current thread's vertex. */
+      exporter_tid_in_tg = wg_scan.second;
+      /* Skip all exports when possible. */
+      Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
+      max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
+
+      ngg_emit_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
+      ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
+   }

-   ngg_emit_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
-   ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
   ngg_gs_export_primitives(ctx, max_vtxcnt, tid_in_tg, exporter_tid_in_tg, prim_flag_0);
   ngg_gs_export_vertices(ctx, wg_vtx_cnt, tid_in_tg, vertex_lds_addr);
 }
@@ -11440,6 +11476,8 @@ void select_program(Program *program,

      if (ngg_no_gs)
         ngg_nogs_prelude(&ctx);
+      else if (!i && ngg_gs)
+         ngg_gs_prelude(&ctx);

      /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
      nir_function_impl *func = nir_shader_get_entrypoint(nir);
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -94,11 +94,14 @@ struct isel_context {

   /* GS inputs */
   bool ngg_nogs_early_prim_export = false;
+   bool ngg_gs_early_alloc = false;
   Temp gs_wave_id;
   unsigned ngg_gs_emit_addr = 0;
   unsigned ngg_gs_emit_vtx_bytes = 0;
   unsigned ngg_gs_scratch_addr = 0;
   unsigned ngg_gs_primflags_offset = 0;
+   int ngg_gs_const_vtxcnt[4];
+   int ngg_gs_const_prmcnt[4];

   /* VS output information */
   bool export_clip_dists;
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -516,6 +516,10 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)

      /* Make sure we have enough room for emitted GS vertices */
      assert((ngg_emit_bytes % (ctx->ngg_gs_emit_vtx_bytes * nir->info.gs.vertices_out)) == 0);
+
+      /* See if the number of vertices and primitives are compile-time known */
+      nir_gs_count_vertices_and_primitives(nir, ctx->ngg_gs_const_vtxcnt, ctx->ngg_gs_const_prmcnt, 4u);
+      ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
   }

   if (ctx->stage & sw_vs)