diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 83ef05e7ebd..676a047c8b4 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -99,10 +99,12 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; program->vgpr_limit = 256; + program->physical_vgprs = 256; program->vgpr_alloc_granule = 3; if (chip_class >= GFX10) { program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */ + program->physical_vgprs = 512; program->sgpr_alloc_granule = 127; program->sgpr_limit = 106; if (chip_class >= GFX10_3) diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 69a9d977595..62b3f493fd6 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1735,6 +1735,7 @@ public: uint16_t vgpr_limit; uint16_t sgpr_limit; uint16_t physical_sgprs; + uint16_t physical_vgprs; uint16_t sgpr_alloc_granule; /* minus one. must be power of two */ uint16_t vgpr_alloc_granule; /* minus one. must be power of two */ unsigned workgroup_size; /* if known; otherwise UINT_MAX */ diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index bc713a1a188..1c041c9dbbb 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -314,7 +314,7 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves) uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves) { - uint16_t vgprs = 256 / max_waves & ~program->vgpr_alloc_granule; + uint16_t vgprs = program->physical_vgprs / max_waves & ~program->vgpr_alloc_granule; return std::min(vgprs, program->vgpr_limit); } @@ -325,7 +325,7 @@ void calc_min_waves(Program* program) if (program->wave_size == 32) waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2); - unsigned simd_per_cu = 4; /* TODO: different on Navi */ + unsigned simd_per_cu = program->chip_class >= GFX10 ? 2 : 4; bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */ unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu; @@ -334,11 +334,12 @@ void calc_min_waves(Program* program) void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) { - /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */ - unsigned max_waves_per_simd = 10; - if ((program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) || program->chip_class >= GFX10_3) + unsigned max_waves_per_simd = program->chip_class == GFX10 ? 20 : 10; + if (program->chip_class >= GFX10_3) + max_waves_per_simd = 16; + else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) max_waves_per_simd = 8; - unsigned simd_per_cu = 4; + unsigned simd_per_cu = program->chip_class >= GFX10 ? 2 : 4; bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */ unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu; @@ -350,7 +351,7 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) program->max_reg_demand = new_demand; } else { program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); - program->num_waves = std::min(program->num_waves, 256 / get_vgpr_alloc(program, new_demand.vgpr)); + program->num_waves = std::min(program->num_waves, program->physical_vgprs / get_vgpr_alloc(program, new_demand.vgpr)); program->max_waves = max_waves_per_simd; /* adjust max_waves for workgroup and LDS limits */ diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 0ae110f3209..fc9be0e0212 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -902,20 +902,25 @@ void schedule_program(Program *program, live& live_vars) /* Allowing the scheduler to reduce the number of waves to as low as 5 * improves performance of Thrones of Britannia significantly and doesn't * seem to hurt anything else. */ - if (program->num_waves <= 5) + //TODO: account for possible uneven num_waves on GFX10+ + unsigned wave_fac = program->physical_vgprs / 256; + if (program->num_waves <= 5 * wave_fac) ctx.num_waves = program->num_waves; else if (demand.vgpr >= 29) - ctx.num_waves = 5; + ctx.num_waves = 5 * wave_fac; else if (demand.vgpr >= 25) - ctx.num_waves = 6; + ctx.num_waves = 6 * wave_fac; else - ctx.num_waves = 7; + ctx.num_waves = 7 * wave_fac; ctx.num_waves = std::max(ctx.num_waves, program->min_waves); ctx.num_waves = std::min(ctx.num_waves, program->num_waves); + /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */ + ctx.num_waves = std::max(ctx.num_waves / wave_fac, 1); + assert(ctx.num_waves > 0); - ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2), - int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))}; + ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), + int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; for (Block& block : program->blocks) schedule_block(ctx, program, &block, live_vars);