aco/scheduler: improve scheduling heuristic
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

The heuristic we are currently using still stems from the GCN era;
the only adjustment made for RDNA was to double (or triple)
the wave count.

This rewrite aims to detangle some concepts and provide more consistent results.

 - wave_factor: The purpose of this value is to reflect that RDNA SIMDs can
                accommodate twice as many waves as GCN SIMDs.
 - reg_file_multiple: This value accounts for the larger register file of wave32
                      and some RDNA3 families.
 - wave_minimum: Below this value, we don't sacrifice any waves. It corresponds
                 to a register demand of 64 VGPRs in wave64.
 - occupancy_factor: Depending on target_waves and wave_factor, this controls
                     the scheduling window sizes and number of moves.

The main differences from the previous heuristic are a lower wave minimum and
a slightly less aggressive reduction of waves.
It also increases SMEM_MAX_MOVES in order to mitigate some of the changes
from targeting fewer waves.

Totals from 62777 (78.63% of 79839) affected shaders: (Navi48)
MaxWaves: 1880983 -> 1848028 (-1.75%); split: +0.01%, -1.76%
Instrs: 40904711 -> 40800797 (-0.25%); split: -0.39%, +0.14%
CodeSize: 217132208 -> 216748832 (-0.18%); split: -0.29%, +0.12%
VGPRs: 3019304 -> 3099596 (+2.66%); split: -0.11%, +2.77%
Latency: 268857129 -> 265951122 (-1.08%); split: -1.33%, +0.25%
InvThroughput: 40960938 -> 41044533 (+0.20%); split: -0.18%, +0.39%
VClause: 794000 -> 782913 (-1.40%); split: -2.24%, +0.84%
SClause: 1192476 -> 1150831 (-3.49%); split: -3.94%, +0.45%
Copies: 2720470 -> 2700148 (-0.75%); split: -1.84%, +1.09%
Branches: 785926 -> 785951 (+0.00%); split: -0.01%, +0.01%
VALU: 22918411 -> 22890189 (-0.12%); split: -0.19%, +0.06%
SALU: 5281201 -> 5289486 (+0.16%); split: -0.21%, +0.36%
VOPD: 8790 -> 8685 (-1.19%); split: +1.08%, -2.28%

Totals from 62081 (77.77% of 79825) affected shaders: (Navi31)
MaxWaves: 1848555 -> 1812347 (-1.96%); split: +0.01%, -1.97%
Instrs: 39794460 -> 39704180 (-0.23%); split: -0.39%, +0.16%
CodeSize: 208987052 -> 208621524 (-0.17%); split: -0.31%, +0.13%
VGPRs: 3046284 -> 3135156 (+2.92%); split: -0.11%, +3.03%
Latency: 268863465 -> 265218186 (-1.36%); split: -1.59%, +0.23%
InvThroughput: 41101515 -> 41167075 (+0.16%); split: -0.22%, +0.38%
VClause: 795316 -> 774899 (-2.57%); split: -3.17%, +0.61%
SClause: 1177294 -> 1135451 (-3.55%); split: -4.06%, +0.51%
Copies: 2743254 -> 2725127 (-0.66%); split: -1.90%, +1.24%
Branches: 801395 -> 801428 (+0.00%); split: -0.01%, +0.02%
VALU: 23898938 -> 23871294 (-0.12%); split: -0.20%, +0.08%
SALU: 3908807 -> 3919130 (+0.26%); split: -0.23%, +0.50%
VOPD: 8529 -> 8500 (-0.34%); split: +1.29%, -1.63%

Totals from 44996 (71.01% of 63370) affected shaders: (Vega10)

MaxWaves: 307074 -> 304808 (-0.74%); split: +0.63%, -1.37%
Instrs: 22743534 -> 22716240 (-0.12%); split: -0.22%, +0.10%
CodeSize: 117284856 -> 117173212 (-0.10%); split: -0.19%, +0.09%
SGPRs: 3249008 -> 3330480 (+2.51%); split: -0.36%, +2.87%
VGPRs: 1901400 -> 1943880 (+2.23%); split: -0.60%, +2.83%
Latency: 224839126 -> 222878477 (-0.87%); split: -1.19%, +0.31%
InvThroughput: 114389570 -> 114316559 (-0.06%); split: -0.17%, +0.11%
VClause: 482012 -> 473304 (-1.81%); split: -2.86%, +1.05%
SClause: 757799 -> 717092 (-5.37%); split: -5.64%, +0.27%
Copies: 2182735 -> 2183598 (+0.04%); split: -1.17%, +1.21%
Branches: 396026 -> 395996 (-0.01%); split: -0.03%, +0.02%
VALU: 16740283 -> 16728098 (-0.07%); split: -0.14%, +0.07%
SALU: 2133575 -> 2145863 (+0.58%); split: -0.29%, +0.86%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30720>
This commit is contained in:
Daniel Schürmann 2024-07-24 11:51:31 +02:00 committed by Marge Bot
parent f292faebd7
commit d3743dd7ba

View file

@ -12,17 +12,17 @@
#include <algorithm>
#include <vector>
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define SMEM_WINDOW_SIZE (256 - ctx.occupancy_factor * 16)
#define VMEM_WINDOW_SIZE (1024 - ctx.occupancy_factor * 64)
#define LDS_WINDOW_SIZE 64
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
#define SMEM_MAX_MOVES (128 - ctx.occupancy_factor * 8)
#define VMEM_MAX_MOVES (256 - ctx.occupancy_factor * 16)
#define LDSDIR_MAX_MOVES 10
#define LDS_MAX_MOVES 32
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 4)
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 2)
#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 4)
#define POS_EXP_MAX_MOVES 512
namespace aco {
@ -115,7 +115,7 @@ struct MoveState {
struct sched_ctx {
amd_gfx_level gfx_level;
int16_t num_waves;
int16_t occupancy_factor;
int16_t last_SMEM_stall;
int last_SMEM_dep_idx;
MoveState mv;
@ -745,7 +745,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
* to help create more vmem clauses */
if ((candidate->isVMEM() || candidate->isFlatLike()) &&
(cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
(cursor.insert_idx - cursor.source_idx > (ctx.occupancy_factor * 4) ||
current->operands[0].size() == 4))
break;
/* don't move descriptor loads below buffer loads */
@ -847,7 +847,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
}
ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0;
ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
ctx.last_SMEM_stall = 10 - ctx.occupancy_factor - k;
}
void
@ -1254,28 +1254,24 @@ schedule_program(Program* program)
ctx.mv.depends_on.resize(program->peekAllocationId());
ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
/* Allowing the scheduler to reduce the number of waves to as low as 5
* improves performance of Thrones of Britannia significantly and doesn't
* seem to hurt anything else. */
unsigned wave_fac = program->dev.physical_vgprs / 256;
if (program->num_waves <= 5 * wave_fac)
ctx.num_waves = program->num_waves;
else if (demand.vgpr >= 29)
ctx.num_waves = 5 * wave_fac;
else if (demand.vgpr >= 25)
ctx.num_waves = 6 * wave_fac;
else
ctx.num_waves = 7 * wave_fac;
ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
assert(ctx.num_waves >= program->min_waves);
ctx.mv.max_registers = get_addr_regs_from_waves(program, ctx.num_waves);
const int wave_factor = program->gfx_level >= GFX10 ? 2 : 1;
const int wave_minimum = std::max<int>(program->min_waves, 4 * wave_factor);
const float reg_file_multiple = program->dev.physical_vgprs / (256.0 * wave_factor);
/* If we already have less waves than the minimum, don't reduce them further.
* Otherwise, sacrifice some waves and use more VGPRs, in order to improve scheduling.
*/
int vgpr_demand = std::max<int>(24, demand.vgpr) + 12 * reg_file_multiple;
int target_waves = std::max(wave_minimum, program->dev.physical_vgprs / vgpr_demand);
target_waves = max_suitable_waves(program, std::min<int>(program->num_waves, target_waves));
assert(target_waves >= program->min_waves);
ctx.mv.max_registers = get_addr_regs_from_waves(program, target_waves);
ctx.mv.max_registers.vgpr -= 2;
/* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
ctx.occupancy_factor = target_waves / wave_factor;
/* NGG culling shaders are very sensitive to position export scheduling.
* Schedule less aggressively when early primitive export is used, and