mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-27 03:20:33 +01:00
aco/scheduler: improve scheduling heuristic
The heuristic we are currently using still stems from the GCN era
with the only adjustments being made for RDNA was to double (or triple)
the wave count.
This rewrite aims to detangle some concepts and provide more consistent results.
- wave_factor: The purpose of this value is to reflect that RDNA SIMDs can
accomodate twice as many waves as GCN SIMDs.
- reg_file_multiple: This value accounts for the larger register file of wave32
and some RDNA3 families.
- wave_minimum: Below this value, we don't sacrifice any waves. It corresponds
to a register demand of 64 VGPRs in wave64.
- occupancy_factor: Depending on target_waves and wave_factor, this controls
the scheduling window sizes and number of moves.
The main differences from the previous heuristic is a lower wave minimum and
a slightly less aggressive reduction of waves.
It also increases SMEM_MAX_MOVES in order to mitigate some of the changes
from targeting less waves.
Totals from 62777 (78.63% of 79839) affected shaders: (Navi48)
MaxWaves: 1880983 -> 1848028 (-1.75%); split: +0.01%, -1.76%
Instrs: 40904711 -> 40800797 (-0.25%); split: -0.39%, +0.14%
CodeSize: 217132208 -> 216748832 (-0.18%); split: -0.29%, +0.12%
VGPRs: 3019304 -> 3099596 (+2.66%); split: -0.11%, +2.77%
Latency: 268857129 -> 265951122 (-1.08%); split: -1.33%, +0.25%
InvThroughput: 40960938 -> 41044533 (+0.20%); split: -0.18%, +0.39%
VClause: 794000 -> 782913 (-1.40%); split: -2.24%, +0.84%
SClause: 1192476 -> 1150831 (-3.49%); split: -3.94%, +0.45%
Copies: 2720470 -> 2700148 (-0.75%); split: -1.84%, +1.09%
Branches: 785926 -> 785951 (+0.00%); split: -0.01%, +0.01%
VALU: 22918411 -> 22890189 (-0.12%); split: -0.19%, +0.06%
SALU: 5281201 -> 5289486 (+0.16%); split: -0.21%, +0.36%
VOPD: 8790 -> 8685 (-1.19%); split: +1.08%, -2.28%
Totals from 62081 (77.77% of 79825) affected shaders: (Navi31)
MaxWaves: 1848555 -> 1812347 (-1.96%); split: +0.01%, -1.97%
Instrs: 39794460 -> 39704180 (-0.23%); split: -0.39%, +0.16%
CodeSize: 208987052 -> 208621524 (-0.17%); split: -0.31%, +0.13%
VGPRs: 3046284 -> 3135156 (+2.92%); split: -0.11%, +3.03%
Latency: 268863465 -> 265218186 (-1.36%); split: -1.59%, +0.23%
InvThroughput: 41101515 -> 41167075 (+0.16%); split: -0.22%, +0.38%
VClause: 795316 -> 774899 (-2.57%); split: -3.17%, +0.61%
SClause: 1177294 -> 1135451 (-3.55%); split: -4.06%, +0.51%
Copies: 2743254 -> 2725127 (-0.66%); split: -1.90%, +1.24%
Branches: 801395 -> 801428 (+0.00%); split: -0.01%, +0.02%
VALU: 23898938 -> 23871294 (-0.12%); split: -0.20%, +0.08%
SALU: 3908807 -> 3919130 (+0.26%); split: -0.23%, +0.50%
VOPD: 8529 -> 8500 (-0.34%); split: +1.29%, -1.63%
Totals from 44996 (71.01% of 63370) affected shaders: (Vega10)
MaxWaves: 307074 -> 304808 (-0.74%); split: +0.63%, -1.37%
Instrs: 22743534 -> 22716240 (-0.12%); split: -0.22%, +0.10%
CodeSize: 117284856 -> 117173212 (-0.10%); split: -0.19%, +0.09%
SGPRs: 3249008 -> 3330480 (+2.51%); split: -0.36%, +2.87%
VGPRs: 1901400 -> 1943880 (+2.23%); split: -0.60%, +2.83%
Latency: 224839126 -> 222878477 (-0.87%); split: -1.19%, +0.31%
InvThroughput: 114389570 -> 114316559 (-0.06%); split: -0.17%, +0.11%
VClause: 482012 -> 473304 (-1.81%); split: -2.86%, +1.05%
SClause: 757799 -> 717092 (-5.37%); split: -5.64%, +0.27%
Copies: 2182735 -> 2183598 (+0.04%); split: -1.17%, +1.21%
Branches: 396026 -> 395996 (-0.01%); split: -0.03%, +0.02%
VALU: 16740283 -> 16728098 (-0.07%); split: -0.14%, +0.07%
SALU: 2133575 -> 2145863 (+0.58%); split: -0.29%, +0.86%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30720>
This commit is contained in:
parent
f292faebd7
commit
d3743dd7ba
1 changed files with 23 additions and 27 deletions
|
|
@ -12,17 +12,17 @@
|
|||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
|
||||
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
|
||||
#define SMEM_WINDOW_SIZE (256 - ctx.occupancy_factor * 16)
|
||||
#define VMEM_WINDOW_SIZE (1024 - ctx.occupancy_factor * 64)
|
||||
#define LDS_WINDOW_SIZE 64
|
||||
#define POS_EXP_WINDOW_SIZE 512
|
||||
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
|
||||
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
|
||||
#define SMEM_MAX_MOVES (128 - ctx.occupancy_factor * 8)
|
||||
#define VMEM_MAX_MOVES (256 - ctx.occupancy_factor * 16)
|
||||
#define LDSDIR_MAX_MOVES 10
|
||||
#define LDS_MAX_MOVES 32
|
||||
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
|
||||
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
|
||||
#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 4)
|
||||
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 2)
|
||||
#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 4)
|
||||
#define POS_EXP_MAX_MOVES 512
|
||||
|
||||
namespace aco {
|
||||
|
|
@ -115,7 +115,7 @@ struct MoveState {
|
|||
|
||||
struct sched_ctx {
|
||||
amd_gfx_level gfx_level;
|
||||
int16_t num_waves;
|
||||
int16_t occupancy_factor;
|
||||
int16_t last_SMEM_stall;
|
||||
int last_SMEM_dep_idx;
|
||||
MoveState mv;
|
||||
|
|
@ -745,7 +745,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
|
|||
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
|
||||
* to help create more vmem clauses */
|
||||
if ((candidate->isVMEM() || candidate->isFlatLike()) &&
|
||||
(cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
|
||||
(cursor.insert_idx - cursor.source_idx > (ctx.occupancy_factor * 4) ||
|
||||
current->operands[0].size() == 4))
|
||||
break;
|
||||
/* don't move descriptor loads below buffer loads */
|
||||
|
|
@ -847,7 +847,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx)
|
|||
}
|
||||
|
||||
ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0;
|
||||
ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
|
||||
ctx.last_SMEM_stall = 10 - ctx.occupancy_factor - k;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -1254,28 +1254,24 @@ schedule_program(Program* program)
|
|||
ctx.mv.depends_on.resize(program->peekAllocationId());
|
||||
ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
|
||||
ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
|
||||
/* Allowing the scheduler to reduce the number of waves to as low as 5
|
||||
* improves performance of Thrones of Britannia significantly and doesn't
|
||||
* seem to hurt anything else. */
|
||||
unsigned wave_fac = program->dev.physical_vgprs / 256;
|
||||
if (program->num_waves <= 5 * wave_fac)
|
||||
ctx.num_waves = program->num_waves;
|
||||
else if (demand.vgpr >= 29)
|
||||
ctx.num_waves = 5 * wave_fac;
|
||||
else if (demand.vgpr >= 25)
|
||||
ctx.num_waves = 6 * wave_fac;
|
||||
else
|
||||
ctx.num_waves = 7 * wave_fac;
|
||||
ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
|
||||
ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
|
||||
ctx.num_waves = max_suitable_waves(program, ctx.num_waves);
|
||||
|
||||
assert(ctx.num_waves >= program->min_waves);
|
||||
ctx.mv.max_registers = get_addr_regs_from_waves(program, ctx.num_waves);
|
||||
const int wave_factor = program->gfx_level >= GFX10 ? 2 : 1;
|
||||
const int wave_minimum = std::max<int>(program->min_waves, 4 * wave_factor);
|
||||
const float reg_file_multiple = program->dev.physical_vgprs / (256.0 * wave_factor);
|
||||
|
||||
/* If we already have less waves than the minimum, don't reduce them further.
|
||||
* Otherwise, sacrifice some waves and use more VGPRs, in order to improve scheduling.
|
||||
*/
|
||||
int vgpr_demand = std::max<int>(24, demand.vgpr) + 12 * reg_file_multiple;
|
||||
int target_waves = std::max(wave_minimum, program->dev.physical_vgprs / vgpr_demand);
|
||||
target_waves = max_suitable_waves(program, std::min<int>(program->num_waves, target_waves));
|
||||
assert(target_waves >= program->min_waves);
|
||||
|
||||
ctx.mv.max_registers = get_addr_regs_from_waves(program, target_waves);
|
||||
ctx.mv.max_registers.vgpr -= 2;
|
||||
|
||||
/* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
|
||||
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
|
||||
ctx.occupancy_factor = target_waves / wave_factor;
|
||||
|
||||
/* NGG culling shaders are very sensitive to position export scheduling.
|
||||
* Schedule less aggressively when early primitive export is used, and
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue