aco/ngg: Incorporate GS invocations into workgroup size calculation.

If the workgroup_size variable is lower than the actual workgroup size, that means it's possible that ACO won't emit some s_barrier instructions when in fact it should. This can possibly cause a GPU hang.

This is just for the sake of general correctness; currently this can't cause a real problem because the maximum vertex count is always greater than (or equal to) the primitive count in GS, and already takes into account the number of GS invocations.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7232>
2025-12-26 08:30:10 +01:00 · 2020-10-15 10:33:18 +02:00 · 2020-10-15 10:33:18 +02:00 · f74ef15879
commit f74ef15879
parent 09b9e52c0d
1 changed file with 4 additions and 3 deletions
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -1129,15 +1129,16 @@ setup_isel_context(Program* program,
      program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
   } else if (program->stage.hw == HWStage::NGG) {
      gfx10_ngg_info &ngg_info = args->shader_info->ngg_info;
+      unsigned num_gs_invocations = (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1;

-      /* Max ES (SW VS) threads */
+      /* Max ES (SW VS/TES) threads */
      uint32_t max_esverts = ngg_info.hw_max_esverts;
      /* Max GS input primitives = max GS threads */
-      uint32_t max_gs_input_prims = ngg_info.max_gsprims;
+      uint32_t max_gs_input_prims = ngg_info.max_gsprims * num_gs_invocations;
      /* Maximum output vertices -- each thread can export only 1 vertex */
      uint32_t max_out_vtx = ngg_info.max_out_verts;
      /* Maximum output primitives -- each thread can export only 1 or 0 primitive */
-      uint32_t max_out_prm = ngg_info.max_gsprims * ngg_info.prim_amp_factor;
+      uint32_t max_out_prm = ngg_info.max_gsprims * num_gs_invocations * ngg_info.prim_amp_factor;

      program->workgroup_size = MAX4(max_esverts, max_gs_input_prims, max_out_vtx, max_out_prm);
   } else {