aco/scheduler: improve scheduling heuristic
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

The heuristic we are currently using still stems from the GCN era;
the only adjustment made for RDNA was to double (or triple)
the wave count.

This rewrite aims to detangle some concepts and provide more consistent results.

 - wave_factor: The purpose of this value is to reflect that RDNA SIMDs can
                accommodate twice as many waves as GCN SIMDs.
 - reg_file_multiple: This value accounts for the larger register file of wave32
                      and some RDNA3 families.
 - wave_minimum: Below this value, we don't sacrifice any waves. It corresponds
                 to a register demand of 64 VGPRs in wave64.
 - occupancy_factor: Depending on target_waves and wave_factor, this controls
                     the scheduling window sizes and number of moves.

The main differences from the previous heuristic are a lower wave minimum and
a slightly less aggressive reduction of waves.
It also increases SMEM_MAX_MOVES in order to mitigate some of the changes
from targeting fewer waves.

Totals from 62777 (78.63% of 79839) affected shaders: (Navi48)
MaxWaves: 1880983 -> 1848028 (-1.75%); split: +0.01%, -1.76%
Instrs: 40904711 -> 40800797 (-0.25%); split: -0.39%, +0.14%
CodeSize: 217132208 -> 216748832 (-0.18%); split: -0.29%, +0.12%
VGPRs: 3019304 -> 3099596 (+2.66%); split: -0.11%, +2.77%
Latency: 268857129 -> 265951122 (-1.08%); split: -1.33%, +0.25%
InvThroughput: 40960938 -> 41044533 (+0.20%); split: -0.18%, +0.39%
VClause: 794000 -> 782913 (-1.40%); split: -2.24%, +0.84%
SClause: 1192476 -> 1150831 (-3.49%); split: -3.94%, +0.45%
Copies: 2720470 -> 2700148 (-0.75%); split: -1.84%, +1.09%
Branches: 785926 -> 785951 (+0.00%); split: -0.01%, +0.01%
VALU: 22918411 -> 22890189 (-0.12%); split: -0.19%, +0.06%
SALU: 5281201 -> 5289486 (+0.16%); split: -0.21%, +0.36%
VOPD: 8790 -> 8685 (-1.19%); split: +1.08%, -2.28%

Totals from 62081 (77.77% of 79825) affected shaders: (Navi31)
MaxWaves: 1848555 -> 1812347 (-1.96%); split: +0.01%, -1.97%
Instrs: 39794460 -> 39704180 (-0.23%); split: -0.39%, +0.16%
CodeSize: 208987052 -> 208621524 (-0.17%); split: -0.31%, +0.13%
VGPRs: 3046284 -> 3135156 (+2.92%); split: -0.11%, +3.03%
Latency: 268863465 -> 265218186 (-1.36%); split: -1.59%, +0.23%
InvThroughput: 41101515 -> 41167075 (+0.16%); split: -0.22%, +0.38%
VClause: 795316 -> 774899 (-2.57%); split: -3.17%, +0.61%
SClause: 1177294 -> 1135451 (-3.55%); split: -4.06%, +0.51%
Copies: 2743254 -> 2725127 (-0.66%); split: -1.90%, +1.24%
Branches: 801395 -> 801428 (+0.00%); split: -0.01%, +0.02%
VALU: 23898938 -> 23871294 (-0.12%); split: -0.20%, +0.08%
SALU: 3908807 -> 3919130 (+0.26%); split: -0.23%, +0.50%
VOPD: 8529 -> 8500 (-0.34%); split: +1.29%, -1.63%

Totals from 44996 (71.01% of 63370) affected shaders: (Vega10)

MaxWaves: 307074 -> 304808 (-0.74%); split: +0.63%, -1.37%
Instrs: 22743534 -> 22716240 (-0.12%); split: -0.22%, +0.10%
CodeSize: 117284856 -> 117173212 (-0.10%); split: -0.19%, +0.09%
SGPRs: 3249008 -> 3330480 (+2.51%); split: -0.36%, +2.87%
VGPRs: 1901400 -> 1943880 (+2.23%); split: -0.60%, +2.83%
Latency: 224839126 -> 222878477 (-0.87%); split: -1.19%, +0.31%
InvThroughput: 114389570 -> 114316559 (-0.06%); split: -0.17%, +0.11%
VClause: 482012 -> 473304 (-1.81%); split: -2.86%, +1.05%
SClause: 757799 -> 717092 (-5.37%); split: -5.64%, +0.27%
Copies: 2182735 -> 2183598 (+0.04%); split: -1.17%, +1.21%
Branches: 396026 -> 395996 (-0.01%); split: -0.03%, +0.02%
VALU: 16740283 -> 16728098 (-0.07%); split: -0.14%, +0.07%
SALU: 2133575 -> 2145863 (+0.58%); split: -0.29%, +0.86%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30720>
This commit is contained in:
Daniel Schürmann 2024-07-24 11:51:31 +02:00 committed by Marge Bot
parent f292faebd7
commit d3743dd7ba

View file

@ -12,17 +12,17 @@
#include <algorithm>
#include <vector>
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define SMEM_WINDOW_SIZE (256 - ctx.occupancy_factor * 16)
#define VMEM_WINDOW_SIZE (1024 - ctx.occupancy_factor * 64)
#define LDS_WINDOW_SIZE 64
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
#define SMEM_MAX_MOVES (128 - ctx.occupancy_factor * 8)
#define VMEM_MAX_MOVES (256 - ctx.occupancy_factor * 16)
#define LDSDIR_MAX_MOVES 10
#define LDS_MAX_MOVES 32
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 4)
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 2)
#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 4)
#define POS_EXP_MAX_MOVES 512
namespace aco {
@ -115,7 +115,7 @@ struct MoveState {
struct sched_ctx {
amd_gfx_level gfx_level;
int16_t num_waves;
int16_t occupancy_factor;
int16_t last_SMEM_stall;
int last_SMEM_dep_idx;
MoveState mv;
@ -745,7 +745,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
* to help create more vmem clauses */
if ((candidate->isVMEM() || candidate->isFlatLike()) &&
(cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
(cursor.insert_idx - cursor.source_idx > (ctx.occupancy_factor * 4) ||
current->operands[0].size() == 4))
break;
/* don't move descriptor loads below buffer loads */
@ -847,7 +847,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
}
ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0;
ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
ctx.last_SMEM_stall = 10 - ctx.occupancy_factor - k;
}
void
@ -1254,28 +1254,24 @@ schedule_program(Program* program)
ctx.mv.depends_on.resize(program->peekAllocationId());
ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
/* Allowing the scheduler to reduce the number of waves to as low as 5
* improves performance of Thrones of Britannia significantly and doesn't
* seem to hurt anything else. */
unsigned wave_fac = program->dev.physical_vgprs / 256;
if (program->num_waves <= 5 * wave_fac)
ctx.num_waves = program->num_waves;
else if (demand.vgpr >= 29)
ctx.num_waves = 5 * wave_fac;
else if (demand.vgpr >= 25)
ctx.num_waves = 6 * wave_fac;
else
ctx.num_waves = 7 * wave_fac;
ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
assert(ctx.num_waves >= program->min_waves);
ctx.mv.max_registers = get_addr_regs_from_waves(program, ctx.num_waves);
const int wave_factor = program->gfx_level >= GFX10 ? 2 : 1;
const int wave_minimum = std::max<int>(program->min_waves, 4 * wave_factor);
const float reg_file_multiple = program->dev.physical_vgprs / (256.0 * wave_factor);
/* If we already have less waves than the minimum, don't reduce them further.
* Otherwise, sacrifice some waves and use more VGPRs, in order to improve scheduling.
*/
int vgpr_demand = std::max<int>(24, demand.vgpr) + 12 * reg_file_multiple;
int target_waves = std::max(wave_minimum, program->dev.physical_vgprs / vgpr_demand);
target_waves = max_suitable_waves(program, std::min<int>(program->num_waves, target_waves));
assert(target_waves >= program->min_waves);
ctx.mv.max_registers = get_addr_regs_from_waves(program, target_waves);
ctx.mv.max_registers.vgpr -= 2;
/* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
ctx.occupancy_factor = target_waves / wave_factor;
/* NGG culling shaders are very sensitive to position export scheduling.
* Schedule less aggressively when early primitive export is used, and