mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 15:20:10 +01:00
aco: fix num_waves on GFX10+
There are half the SIMDs per CU and physical_vgprs should be 512 instead of 256. fossil-db (GFX10.3): Totals from 3622 (2.60% of 139391) affected shaders: VGPRs: 298192 -> 289732 (-2.84%); split: -3.43%, +0.59% CodeSize: 29443432 -> 29458388 (+0.05%); split: -0.00%, +0.06% MaxWaves: 21703 -> 23395 (+7.80%); split: +7.84%, -0.05% Instrs: 5677920 -> 5681438 (+0.06%); split: -0.01%, +0.07% Cycles: 280715524 -> 280895676 (+0.06%); split: -0.00%, +0.07% VMEM: 981142 -> 981894 (+0.08%); split: +0.18%, -0.10% SMEM: 243315 -> 243454 (+0.06%); split: +0.07%, -0.02% VClause: 88991 -> 89767 (+0.87%); split: -0.02%, +0.89% SClause: 200660 -> 200659 (-0.00%); split: -0.00%, +0.00% Copies: 430729 -> 434160 (+0.80%); split: -0.07%, +0.86% Branches: 158004 -> 158021 (+0.01%); split: -0.01%, +0.02% Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8523>
This commit is contained in:
parent
12ea0143de
commit
489aa8c7cb
4 changed files with 22 additions and 13 deletions
|
|
@ -99,10 +99,12 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
|
||||
|
||||
program->vgpr_limit = 256;
|
||||
program->physical_vgprs = 256;
|
||||
program->vgpr_alloc_granule = 3;
|
||||
|
||||
if (chip_class >= GFX10) {
|
||||
program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
|
||||
program->physical_vgprs = 512;
|
||||
program->sgpr_alloc_granule = 127;
|
||||
program->sgpr_limit = 106;
|
||||
if (chip_class >= GFX10_3)
|
||||
|
|
|
|||
|
|
@ -1735,6 +1735,7 @@ public:
|
|||
uint16_t vgpr_limit;
|
||||
uint16_t sgpr_limit;
|
||||
uint16_t physical_sgprs;
|
||||
uint16_t physical_vgprs;
|
||||
uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
|
||||
uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
|
||||
unsigned workgroup_size; /* if known; otherwise UINT_MAX */
|
||||
|
|
|
|||
|
|
@ -314,7 +314,7 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
|
|||
|
||||
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
|
||||
{
|
||||
uint16_t vgprs = 256 / max_waves & ~program->vgpr_alloc_granule;
|
||||
uint16_t vgprs = program->physical_vgprs / max_waves & ~program->vgpr_alloc_granule;
|
||||
return std::min(vgprs, program->vgpr_limit);
|
||||
}
|
||||
|
||||
|
|
@ -325,7 +325,7 @@ void calc_min_waves(Program* program)
|
|||
if (program->wave_size == 32)
|
||||
waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2);
|
||||
|
||||
unsigned simd_per_cu = 4; /* TODO: different on Navi */
|
||||
unsigned simd_per_cu = program->chip_class >= GFX10 ? 2 : 4;
|
||||
bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
|
||||
unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
|
||||
|
||||
|
|
@ -334,11 +334,12 @@ void calc_min_waves(Program* program)
|
|||
|
||||
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
||||
{
|
||||
/* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
|
||||
unsigned max_waves_per_simd = 10;
|
||||
if ((program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) || program->chip_class >= GFX10_3)
|
||||
unsigned max_waves_per_simd = program->chip_class == GFX10 ? 20 : 10;
|
||||
if (program->chip_class >= GFX10_3)
|
||||
max_waves_per_simd = 16;
|
||||
else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
|
||||
max_waves_per_simd = 8;
|
||||
unsigned simd_per_cu = 4;
|
||||
unsigned simd_per_cu = program->chip_class >= GFX10 ? 2 : 4;
|
||||
|
||||
bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
|
||||
unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
|
||||
|
|
@ -350,7 +351,7 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
|||
program->max_reg_demand = new_demand;
|
||||
} else {
|
||||
program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
|
||||
program->num_waves = std::min<uint16_t>(program->num_waves, 256 / get_vgpr_alloc(program, new_demand.vgpr));
|
||||
program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / get_vgpr_alloc(program, new_demand.vgpr));
|
||||
program->max_waves = max_waves_per_simd;
|
||||
|
||||
/* adjust max_waves for workgroup and LDS limits */
|
||||
|
|
|
|||
|
|
@ -902,20 +902,25 @@ void schedule_program(Program *program, live& live_vars)
|
|||
/* Allowing the scheduler to reduce the number of waves to as low as 5
|
||||
* improves performance of Thrones of Britannia significantly and doesn't
|
||||
* seem to hurt anything else. */
|
||||
if (program->num_waves <= 5)
|
||||
//TODO: account for possible uneven num_waves on GFX10+
|
||||
unsigned wave_fac = program->physical_vgprs / 256;
|
||||
if (program->num_waves <= 5 * wave_fac)
|
||||
ctx.num_waves = program->num_waves;
|
||||
else if (demand.vgpr >= 29)
|
||||
ctx.num_waves = 5;
|
||||
ctx.num_waves = 5 * wave_fac;
|
||||
else if (demand.vgpr >= 25)
|
||||
ctx.num_waves = 6;
|
||||
ctx.num_waves = 6 * wave_fac;
|
||||
else
|
||||
ctx.num_waves = 7;
|
||||
ctx.num_waves = 7 * wave_fac;
|
||||
ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
|
||||
ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);
|
||||
|
||||
/* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
|
||||
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
|
||||
|
||||
assert(ctx.num_waves > 0);
|
||||
ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
|
||||
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};
|
||||
ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
|
||||
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
|
||||
|
||||
for (Block& block : program->blocks)
|
||||
schedule_block(ctx, program, &block, live_vars);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue