diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 0bfce41101e..52436cd9e2d 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -12,17 +12,17 @@ #include #include -#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35) -#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64) +#define SMEM_WINDOW_SIZE (256 - ctx.occupancy_factor * 16) +#define VMEM_WINDOW_SIZE (1024 - ctx.occupancy_factor * 64) #define LDS_WINDOW_SIZE 64 #define POS_EXP_WINDOW_SIZE 512 -#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) -#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16) +#define SMEM_MAX_MOVES (128 - ctx.occupancy_factor * 8) +#define VMEM_MAX_MOVES (256 - ctx.occupancy_factor * 16) #define LDSDIR_MAX_MOVES 10 #define LDS_MAX_MOVES 32 /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */ -#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2) -#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 4) +#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 2) +#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.occupancy_factor * 4) #define POS_EXP_MAX_MOVES 512 namespace aco { @@ -115,7 +115,7 @@ struct MoveState { struct sched_ctx { amd_gfx_level gfx_level; - int16_t num_waves; + int16_t occupancy_factor; int16_t last_SMEM_stall; int last_SMEM_dep_idx; MoveState mv; @@ -745,7 +745,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx) /* only move VMEM instructions below descriptor loads. 
be more aggressive at higher num_waves * to help create more vmem clauses */ if ((candidate->isVMEM() || candidate->isFlatLike()) && - (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || + (cursor.insert_idx - cursor.source_idx > (ctx.occupancy_factor * 4) || current->operands[0].size() == 4)) break; /* don't move descriptor loads below buffer loads */ @@ -847,7 +847,7 @@ schedule_SMEM(sched_ctx& ctx, Block* block, Instruction* current, int idx) } ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0; - ctx.last_SMEM_stall = 10 - ctx.num_waves - k; + ctx.last_SMEM_stall = 10 - ctx.occupancy_factor - k; } void @@ -1254,28 +1254,24 @@ schedule_program(Program* program) ctx.mv.depends_on.resize(program->peekAllocationId()); ctx.mv.RAR_dependencies.resize(program->peekAllocationId()); ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId()); - /* Allowing the scheduler to reduce the number of waves to as low as 5 - * improves performance of Thrones of Britannia significantly and doesn't - * seem to hurt anything else. */ - unsigned wave_fac = program->dev.physical_vgprs / 256; - if (program->num_waves <= 5 * wave_fac) - ctx.num_waves = program->num_waves; - else if (demand.vgpr >= 29) - ctx.num_waves = 5 * wave_fac; - else if (demand.vgpr >= 25) - ctx.num_waves = 6 * wave_fac; - else - ctx.num_waves = 7 * wave_fac; - ctx.num_waves = std::max(ctx.num_waves, program->min_waves); - ctx.num_waves = std::min(ctx.num_waves, program->num_waves); - ctx.num_waves = max_suitable_waves(program, ctx.num_waves); - assert(ctx.num_waves >= program->min_waves); - ctx.mv.max_registers = get_addr_regs_from_waves(program, ctx.num_waves); + const int wave_factor = program->gfx_level >= GFX10 ? 2 : 1; + const int wave_minimum = std::max(program->min_waves, 4 * wave_factor); + const float reg_file_multiple = program->dev.physical_vgprs / (256.0 * wave_factor); + + /* If we already have fewer waves than the minimum, don't reduce them further. 
+ * Otherwise, sacrifice some waves and use more VGPRs, in order to improve scheduling. + */ + int vgpr_demand = std::max(24, demand.vgpr) + 12 * reg_file_multiple; + int target_waves = std::max(wave_minimum, program->dev.physical_vgprs / vgpr_demand); + target_waves = max_suitable_waves(program, std::min(program->num_waves, target_waves)); + assert(target_waves >= program->min_waves); + + ctx.mv.max_registers = get_addr_regs_from_waves(program, target_waves); ctx.mv.max_registers.vgpr -= 2; /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */ - ctx.num_waves = std::max(ctx.num_waves / wave_fac, 1); + ctx.occupancy_factor = target_waves / wave_factor; /* NGG culling shaders are very sensitive to position export scheduling. * Schedule less aggressively when early primitive export is used, and