radv, aco, ac/nir: Tweak position export scheduling for NGG culling.

The result is about +5-ish fps in Doom Eternal.

It turns out that the location of position exports matters more
than we thought, and it's actually better to keep them at the bottom
for culling shaders rather than schedule them up to the top.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
This commit is contained in:
Timur Kristóf 2021-07-05 15:26:18 +02:00 committed by Marge Bot
parent 0bb543bb60
commit 8341af5109
5 changed files with 20 additions and 3 deletions

View file

@@ -96,6 +96,7 @@ typedef struct
unsigned lds_bytes_if_culling_off;
bool can_cull;
bool passthrough;
bool early_prim_export;
uint64_t nggc_inputs_read_by_pos;
uint64_t nggc_inputs_read_by_others;
} ac_nir_ngg_config;

View file

@@ -1290,6 +1290,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
.lds_bytes_if_culling_off = lds_bytes_if_culling_off,
.can_cull = can_cull,
.passthrough = passthrough,
.early_prim_export = state.early_prim_export,
.nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
.nggc_inputs_read_by_others = state.inputs_needed_by_others,
};

View file

@@ -126,6 +126,8 @@ struct sched_ctx {
int16_t last_SMEM_stall;
int last_SMEM_dep_idx;
MoveState mv;
bool schedule_pos_exports = true;
unsigned schedule_pos_export_div = 1;
};
/* This scheduler is a simple bottom-up pass based on ideas from
@@ -928,8 +930,8 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = POS_EXP_WINDOW_SIZE;
int max_moves = POS_EXP_MAX_MOVES;
int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div;
int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div;
int16_t k = 0;
DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
@@ -982,7 +984,7 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
Instruction* current = block->instructions[idx].get();
if (block->kind & block_kind_export_end && current->isEXP()) {
if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
unsigned target = current->exp().dest;
if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
ctx.mv.current = current;
@@ -1048,6 +1050,17 @@ schedule_program(Program* program, live& live_vars)
ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
/* NGG culling shaders are very sensitive to position export scheduling.
* Schedule less aggressively when early primitive export is used, and
* keep the position export at the very bottom when late primitive export is used.
*/
if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
if (!program->info->has_ngg_early_prim_export)
ctx.schedule_pos_exports = false;
else
ctx.schedule_pos_export_div = 4;
}
for (Block& block : program->blocks)
schedule_block(ctx, program, &block, live_vars);

View file

@@ -1017,6 +1017,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
key->vs.provoking_vtx_last);
info->has_ngg_culling = out_conf.can_cull;
info->has_ngg_early_prim_export = out_conf.early_prim_export;
info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
info->is_ngg_passthrough = out_conf.passthrough;
key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;

View file

@@ -264,6 +264,7 @@ struct radv_shader_info {
bool is_ngg;
bool is_ngg_passthrough;
bool has_ngg_culling;
bool has_ngg_early_prim_export;
uint32_t num_lds_blocks_when_not_culling;
uint32_t num_tess_patches;
struct {