aco/gfx11: increase gfx1100/gfx1101 physical vgprs

https://reviews.llvm.org/D134522

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18825>
This commit is contained in:
Rhys Perry 2022-09-26 17:18:48 +01:00 committed by Marge Bot
parent 67ebe86f0c
commit 50073d6135
5 changed files with 18 additions and 10 deletions

View file

@ -12563,7 +12563,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade
program->config->float_mode = program->blocks[0].fp_mode.val;
/* addition on GFX6-8 requires a carry-out (we use VCC) */
program->needs_vcc = program->gfx_level <= GFX8;
program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
}

View file

@ -111,14 +111,20 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
if (gfx_level >= GFX10) {
program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
program->dev.sgpr_alloc_granule = 128;
program->dev.sgpr_limit =
108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
if (gfx_level == GFX10_3)
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
else
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
if (family == CHIP_GFX1100 || family == CHIP_GFX1101) {
program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
} else {
program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
if (gfx_level >= GFX10_3)
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
else
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
}
} else if (program->gfx_level >= GFX8) {
program->dev.physical_sgprs = 800;
program->dev.sgpr_alloc_granule = 16;

View file

@ -2134,7 +2134,7 @@ struct DeviceInfo {
uint16_t vgpr_limit;
uint16_t sgpr_limit;
uint16_t sgpr_alloc_granule;
uint16_t vgpr_alloc_granule; /* must be power of two */
uint16_t vgpr_alloc_granule;
unsigned max_wave64_per_simd;
unsigned simd_per_cu;
bool has_fast_fma32 = false;

View file

@ -348,7 +348,7 @@ get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
{
assert(addressable_vgprs <= program->dev.vgpr_limit);
uint16_t granule = program->dev.vgpr_alloc_granule;
return align(std::max(addressable_vgprs, granule), granule);
return ALIGN_NPOT(std::max(addressable_vgprs, granule), granule);
}
unsigned
@ -370,7 +370,8 @@ get_addr_sgpr_from_waves(Program* program, uint16_t waves)
/* Computes the number of VGPRs a single wave may address when `waves` waves
 * share the physical VGPR file: the per-wave share (physical_vgprs / waves) is
 * rounded down to a multiple of vgpr_alloc_granule, half of num_shared_vgprs is
 * subtracted, and the result is capped at vgpr_limit.
 *
 * `waves` is used as a divisor, so callers must pass a non-zero value.
 *
 * NOTE(review): `vgprs` is declared twice below. This looks like the removed
 * and added lines of a diff hunk whose +/- markers were lost -- confirm which
 * variant is meant to be live before compiling this text as-is.
 */
uint16_t
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
{
/* Variant 1: round down by masking with ~(granule - 1). This only produces a
 * multiple of the granule when the granule is a power of two. */
uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
/* Variant 2: round down by integer divide-then-multiply, which produces a
 * multiple of the granule for any non-zero granule (e.g. 12 or 24). */
uint16_t vgprs = program->dev.physical_vgprs / waves;
vgprs = vgprs / program->dev.vgpr_alloc_granule * program->dev.vgpr_alloc_granule;
/* Take half of the shared VGPR count out of the per-wave budget. */
vgprs -= program->config->num_shared_vgprs / 2;
/* Never report more than the device's VGPR limit. */
return std::min(vgprs, program->dev.vgpr_limit);
}

View file

@ -3149,7 +3149,8 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
} /* end for BB */
/* num_gpr = rnd_up(max_used_gpr + 1) */
program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
program->config->num_vgprs =
std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
program->progress = CompilationProgress::after_ra;