From 6ea9443726fff6921d548faae3c1afa8cedc6a4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?=
Date: Wed, 24 Jul 2024 11:48:47 +0200
Subject: [PATCH] aco/scheduler: stop rounding down the target number of waves on GFX10+

This way, it can make use of uneven wave numbers.

Totals from 4078 (5.14% of 79395) affected shaders: (Navi21)

MaxWaves: 58715 -> 65460 (+11.49%); split: +11.49%, -0.01%
Instrs: 5033684 -> 5048244 (+0.29%); split: -0.09%, +0.38%
CodeSize: 26833884 -> 26898780 (+0.24%); split: -0.07%, +0.32%
VGPRs: 302360 -> 265312 (-12.25%); split: -12.26%, +0.01%
Latency: 34636448 -> 36044242 (+4.06%); split: -0.08%, +4.14%
InvThroughput: 7999403 -> 7662697 (-4.21%); split: -4.55%, +0.34%
VClause: 105403 -> 111996 (+6.26%); split: -0.40%, +6.66%
SClause: 132996 -> 133460 (+0.35%); split: -0.81%, +1.16%
Copies: 297036 -> 308122 (+3.73%); split: -0.64%, +4.37%
Branches: 89376 -> 89390 (+0.02%); split: -0.00%, +0.02%
VALU: 3477621 -> 3488510 (+0.31%); split: -0.05%, +0.36%
SALU: 484211 -> 484191 (-0.00%); split: -0.08%, +0.08%

Totals from 1840 (2.32% of 79395) affected shaders: (Navi31)

MaxWaves: 30714 -> 34182 (+11.29%)
Instrs: 3102955 -> 3131001 (+0.90%); split: -0.05%, +0.95%
CodeSize: 16160564 -> 16273100 (+0.70%); split: -0.04%, +0.74%
VGPRs: 174540 -> 150600 (-13.72%)
Latency: 23521914 -> 24515055 (+4.22%); split: -0.07%, +4.29%
InvThroughput: 4373397 -> 4202912 (-3.90%); split: -4.40%, +0.50%
VClause: 59087 -> 64091 (+8.47%); split: -0.24%, +8.71%
SClause: 74844 -> 75366 (+0.70%); split: -0.53%, +1.22%
Copies: 184396 -> 197747 (+7.24%); split: -0.25%, +7.49%
Branches: 46015 -> 46028 (+0.03%); split: -0.00%, +0.03%
VALU: 1929286 -> 1942709 (+0.70%); split: -0.02%, +0.71%
SALU: 216126 -> 215983 (-0.07%); split: -0.18%, +0.12%
VOPD: 1216 -> 1217 (+0.08%); split: +1.40%, -1.32%

Part-of:
---
 src/amd/compiler/aco_scheduler.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/amd/compiler/aco_scheduler.cpp 
b/src/amd/compiler/aco_scheduler.cpp index dfab79f5d85..896452e9d6c 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -1246,7 +1246,6 @@ schedule_program(Program* program) /* Allowing the scheduler to reduce the number of waves to as low as 5 * improves performance of Thrones of Britannia significantly and doesn't * seem to hurt anything else. */ - // TODO: account for possible uneven num_waves on GFX10+ unsigned wave_fac = program->dev.physical_vgprs / 256; if (program->num_waves <= 5 * wave_fac) ctx.num_waves = program->num_waves; @@ -1260,17 +1259,13 @@ schedule_program(Program* program) ctx.num_waves = std::min(ctx.num_waves, program->num_waves); ctx.num_waves = max_suitable_waves(program, ctx.num_waves); + assert(ctx.num_waves >= program->min_waves); + ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2), + int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))}; + /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */ ctx.num_waves = std::max(ctx.num_waves / wave_fac, 1); - assert(ctx.num_waves > 0); - ctx.mv.max_registers = { - int16_t(get_addr_vgpr_from_waves( - program, std::max(ctx.num_waves * wave_fac, program->min_waves)) - - 2), - int16_t(get_addr_sgpr_from_waves( - program, std::max(ctx.num_waves * wave_fac, program->min_waves)))}; - /* NGG culling shaders are very sensitive to position export scheduling. * Schedule less aggressively when early primitive export is used, and * keep the position export at the very bottom when late primitive export is used.