From d3743dd7ba6e191c2eb15da99360460cd003ede6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= <daniel@schuermann.dev>
Date: Wed, 24 Jul 2024 11:51:31 +0200
Subject: [PATCH] aco/scheduler: improve scheduling heuristic

The heuristic we are currently using still stems from the GCN era
with the only adjustments being made for RDNA was to double (or triple)
the wave count.

This rewrite aims to detangle some concepts and provide more consistent results.

 - wave_factor: The purpose of this value is to reflect that RDNA SIMDs can
                accomodate twice as many waves as GCN SIMDs.
 - reg_file_multiple: This value accounts for the larger register file of wave32
                      and some RDNA3 families.
 - wave_minimum: Below this value, we don't sacrifice any waves. It corresponds
                 to a register demand of 64 VGPRs in wave64.
 - occupancy_factor: Depending on target_waves and wave_factor, this controls
                     the scheduling window sizes and number of moves.

The main differences from the previous heuristic is a lower wave minimum and
a slightly less aggressive reduction of waves.
It also increases SMEM_MAX_MOVES in order to mitigate some of the changes
from targeting less waves.

Totals from 62777 (78.63% of 79839) affected shaders: (Navi48)
MaxWaves: 1880983 -> 1848028 (-1.75%); split: +0.01%, -1.76%
Instrs: 40904711 -> 40800797 (-0.25%); split: -0.39%, +0.14%
CodeSize: 217132208 -> 216748832 (-0.18%); split: -0.29%, +0.12%
VGPRs: 3019304 -> 3099596 (+2.66%); split: -0.11%, +2.77%
Latency: 268857129 -> 265951122 (-1.08%); split: -1.33%, +0.25%
InvThroughput: 40960938 -> 41044533 (+0.20%); split: -0.18%, +0.39%
VClause: 794000 -> 782913 (-1.40%); split: -2.24%, +0.84%
SClause: 1192476 -> 1150831 (-3.49%); split: -3.94%, +0.45%
Copies: 2720470 -> 2700148 (-0.75%); split: -1.84%, +1.09%
Branches: 785926 -> 785951 (+0.00%); split: -0.01%, +0.01%
VALU: 22918411 -> 22890189 (-0.12%); split: -0.19%, +0.06%
SALU: 5281201 -> 5289486 (+0.16%); split: -0.21%, +0.36%
VOPD: 8790 -> 8685 (-1.19%); split: +1.08%, -2.28%

Totals from 62081 (77.77% of 79825) affected shaders: (Navi31)
MaxWaves: 1848555 -> 1812347 (-1.96%); split: +0.01%, -1.97%
Instrs: 39794460 -> 39704180 (-0.23%); split: -0.39%, +0.16%
CodeSize: 208987052 -> 208621524 (-0.17%); split: -0.31%, +0.13%
VGPRs: 3046284 -> 3135156 (+2.92%); split: -0.11%, +3.03%
Latency: 268863465 -> 265218186 (-1.36%); split: -1.59%, +0.23%
InvThroughput: 41101515 -> 41167075 (+0.16%); split: -0.22%, +0.38%
VClause: 795316 -> 774899 (-2.57%); split: -3.17%, +0.61%
SClause: 1177294 -> 1135451 (-3.55%); split: -4.06%, +0.51%
Copies: 2743254 -> 2725127 (-0.66%); split: -1.90%, +1.24%
Branches: 801395 -> 801428 (+0.00%); split: -0.01%, +0.02%
VALU: 23898938 -> 23871294 (-0.12%); split: -0.20%, +0.08%
SALU: 3908807 -> 3919130 (+0.26%); split: -0.23%, +0.50%
VOPD: 8529 -> 8500 (-0.34%); split: +1.29%, -1.63%

Totals from 44996 (71.01% of 63370) affected shaders: (Vega10)

MaxWaves: 307074 -> 304808 (-0.74%); split: +0.63%, -1.37%
Instrs: 22743534 -> 22716240 (-0.12%); split: -0.22%, +0.10%
CodeSize: 117284856 -> 117173212 (-0.10%); split: -0.19%, +0.09%
SGPRs: 3249008 -> 3330480 (+2.51%); split: -0.36%, +2.87%
VGPRs: 1901400 -> 1943880 (+2.23%); split: -0.60%, +2.83%
Latency: 224839126 -> 222878477 (-0.87%); split: -1.19%, +0.31%
InvThroughput: 114389570 -> 114316559 (-0.06%); split: -0.17%, +0.11%
VClause: 482012 -> 473304 (-1.81%); split: -2.86%, +1.05%
SClause: 757799 -> 717092 (-5.37%); split: -5.64%, +0.27%
Copies: 2182735 -> 2183598 (+0.04%); split: -1.17%, +1.21%
Branches: 396026 -> 395996 (-0.01%); split: -0.03%, +0.02%
VALU: 16740283 -> 16728098 (-0.07%); split: -0.14%, +0.07%
SALU: 2133575 -> 2145863 (+0.58%); split: -0.29%, +0.86%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30720>
---
 src/amd/compiler/aco_scheduler.cpp | 50 ++++++++++++++----------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 0bfce41101e..52436cd9e2d 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -12,17 +12,17 @@
 #include <algorithm>
 #include <vector>
 
-#define SMEM_WINDOW_SIZE    (350 - ctx.num_waves * 35)
-#define VMEM_WINDOW_SIZE    (1024 - ctx.num_waves * 64)
+#define SMEM_WINDOW_SIZE    (256 - ctx.occupancy_factor * 16)
+#define VMEM_WINDOW_SIZE    (1024 - ctx.occupancy_factor * 64)
 #define LDS_WINDOW_SIZE     64
 #define POS_EXP_WINDOW_SIZE 512
-#define SMEM_MAX_MOVES      (64 - ctx.num_waves * 4)
-#define VMEM_MAX_MOVES      (256 - ctx.num_waves * 16)
+#define SMEM_MAX_MOVES      (128 - ctx.occupancy_factor * 8)
+#define VMEM_MAX_MOVES      (256 - ctx.occupancy_factor * 16)
 #define LDSDIR_MAX_MOVES    10
 #define LDS_MAX_MOVES       32
 /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
-#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
-#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 4)
+#define VMEM_CLAUSE_MAX_GRAB_DIST       (ctx.occupancy_factor * 2)
+#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 4)
 #define POS_EXP_MAX_MOVES         512
 
 namespace aco {
@@ -115,7 +115,7 @@ struct MoveState {
 
 struct sched_ctx {
    amd_gfx_level gfx_level;
-   int16_t num_waves;
+   int16_t occupancy_factor;
    int16_t last_SMEM_stall;
    int last_SMEM_dep_idx;
    MoveState mv;
@@ -745,7 +745,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
       /* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
        * to help create more vmem clauses */
       if ((candidate->isVMEM() || candidate->isFlatLike()) &&
-          (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
+          (cursor.insert_idx - cursor.source_idx > (ctx.occupancy_factor * 4) ||
            current->operands[0].size() == 4))
          break;
       /* don't move descriptor loads below buffer loads */
@@ -847,7 +847,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
    }
 
    ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0;
-   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
+   ctx.last_SMEM_stall = 10 - ctx.occupancy_factor - k;
 }
 
 void
@@ -1254,28 +1254,24 @@ schedule_program(Program* program)
    ctx.mv.depends_on.resize(program->peekAllocationId());
    ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
    ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
-   /* Allowing the scheduler to reduce the number of waves to as low as 5
-    * improves performance of Thrones of Britannia significantly and doesn't
-    * seem to hurt anything else. */
-   unsigned wave_fac = program->dev.physical_vgprs / 256;
-   if (program->num_waves <= 5 * wave_fac)
-      ctx.num_waves = program->num_waves;
-   else if (demand.vgpr >= 29)
-      ctx.num_waves = 5 * wave_fac;
-   else if (demand.vgpr >= 25)
-      ctx.num_waves = 6 * wave_fac;
-   else
-      ctx.num_waves = 7 * wave_fac;
-   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
-   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
-   ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
 
-   assert(ctx.num_waves >= program->min_waves);
-   ctx.mv.max_registers = get_addr_regs_from_waves(program, ctx.num_waves);
+   const int wave_factor = program->gfx_level >= GFX10 ? 2 : 1;
+   const int wave_minimum = std::max<int>(program->min_waves, 4 * wave_factor);
+   const float reg_file_multiple = program->dev.physical_vgprs / (256.0 * wave_factor);
+
+   /* If we already have less waves than the minimum, don't reduce them further.
+    * Otherwise, sacrifice some waves and use more VGPRs, in order to improve scheduling.
+    */
+   int vgpr_demand = std::max<int>(24, demand.vgpr) + 12 * reg_file_multiple;
+   int target_waves = std::max(wave_minimum, program->dev.physical_vgprs / vgpr_demand);
+   target_waves = max_suitable_waves(program, std::min<int>(program->num_waves, target_waves));
+   assert(target_waves >= program->min_waves);
+
+   ctx.mv.max_registers = get_addr_regs_from_waves(program, target_waves);
    ctx.mv.max_registers.vgpr -= 2;
 
    /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
-   ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
+   ctx.occupancy_factor = target_waves / wave_factor;
 
    /* NGG culling shaders are very sensitive to position export scheduling.
     * Schedule less aggressively when early primitive export is used, and