From 5b4e41e4db744e837998b482e3f53e0006946009 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 1 Apr 2022 19:51:55 +0100 Subject: [PATCH] aco: don't use v_mad_mix on GFX9 if 16-bit denormals must be preserved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This probably effectively disables the v_mad_mix optimization on GFX9. fossil-db (Vega): Totals from 11545 (7.15% of 161366) affected shaders: MaxWaves: 43025 -> 42780 (-0.57%); split: +0.06%, -0.63% Instrs: 18571635 -> 18734201 (+0.88%); split: -0.00%, +0.88% CodeSize: 96483568 -> 96611012 (+0.13%); split: -0.11%, +0.24% SGPRs: 1079056 -> 1077616 (-0.13%); split: -0.14%, +0.01% VGPRs: 819248 -> 821868 (+0.32%); split: -0.04%, +0.36% SpillSGPRs: 13313 -> 12464 (-6.38%) Latency: 293804093 -> 295046122 (+0.42%); split: -0.09%, +0.51% InvThroughput: 110002239 -> 110994978 (+0.90%); split: -0.03%, +0.93% VClause: 342458 -> 342596 (+0.04%); split: -0.12%, +0.16% SClause: 648566 -> 648046 (-0.08%); split: -0.12%, +0.04% Copies: 1728225 -> 1726679 (-0.09%); split: -0.66%, +0.57% Branches: 552973 -> 552963 (-0.00%); split: -0.02%, +0.02% PreSGPRs: 862360 -> 856820 (-0.64%); split: -0.69%, +0.05% PreVGPRs: 773689 -> 776818 (+0.40%); split: -0.02%, +0.42% Signed-off-by: Rhys Perry Reviewed-by: Timur Kristóf Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6178 Part-of: --- src/amd/compiler/aco_optimizer.cpp | 4 ++++ src/amd/compiler/tests/test_optimizer.cpp | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 5ac2ebc7335..b0d08774379 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3473,6 +3473,10 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr) if (ctx.program->chip_class < GFX9) return false; + /* v_mad_mix* on GFX9 always flushes denormals for 16-bit inputs/outputs */ + if (ctx.program->chip_class == GFX9 && 
ctx.fp_mode.denorm16_64) + return false; + switch (instr->opcode) { case aco_opcode::v_add_f32: case aco_opcode::v_sub_f32: diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 3f1da2dbd9a..f4ff665143f 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -1164,6 +1164,8 @@ BEGIN_TEST(optimize.mad_mix.input_conv.basic) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1197,6 +1199,8 @@ BEGIN_TEST(optimize.mad_mix.input_conv.precision) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1248,6 +1252,8 @@ BEGIN_TEST(optimize.mad_mix.input_conv.modifiers) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1341,6 +1347,8 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic) if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp c = inputs[2]; @@ -1381,6 +1389,8 @@ BEGIN_TEST(optimize.mad_mix.output_conv.precision) if (!setup_cs("v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a16 = inputs[0]; //! 
v2b: %res0_tmp = v_mul_f16 %a16, %a16 @@ -1403,6 +1413,8 @@ BEGIN_TEST(optimize.mad_mix.output_conv.modifiers) if (!setup_cs("v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp a16 = inputs[2]; @@ -1450,6 +1462,8 @@ BEGIN_TEST(optimize.mad_mix.fma.basic) if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp c = inputs[2]; @@ -1502,6 +1516,8 @@ BEGIN_TEST(optimize.mad_mix.fma.precision) if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp c = inputs[2]; @@ -1562,6 +1578,8 @@ BEGIN_TEST(optimize.mad_mix.clamp) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1587,6 +1605,8 @@ BEGIN_TEST(optimize.mad_mix.cast) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1];