diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 5ac2ebc7335..b0d08774379 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3473,6 +3473,10 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr& instr) if (ctx.program->chip_class < GFX9) return false; + /* v_mad_mix* on GFX9 always flushes denormals for 16-bit inputs/outputs */ + if (ctx.program->chip_class == GFX9 && ctx.fp_mode.denorm16_64) + return false; + switch (instr->opcode) { case aco_opcode::v_add_f32: case aco_opcode::v_sub_f32: diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 3f1da2dbd9a..f4ff665143f 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -1164,6 +1164,8 @@ BEGIN_TEST(optimize.mad_mix.input_conv.basic) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1197,6 +1199,8 @@ BEGIN_TEST(optimize.mad_mix.input_conv.precision) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1248,6 +1252,8 @@ BEGIN_TEST(optimize.mad_mix.input_conv.modifiers) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1341,6 +1347,8 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic) if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp c = inputs[2]; @@ -1381,6 +1389,8 @@ BEGIN_TEST(optimize.mad_mix.output_conv.precision) if (!setup_cs("v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a16 = inputs[0]; //! v2b: %res0_tmp = v_mul_f16 %a16, %a16 @@ -1403,6 +1413,8 @@ BEGIN_TEST(optimize.mad_mix.output_conv.modifiers) if (!setup_cs("v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp a16 = inputs[2]; @@ -1450,6 +1462,8 @@ BEGIN_TEST(optimize.mad_mix.fma.basic) if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp c = inputs[2]; @@ -1502,6 +1516,8 @@ BEGIN_TEST(optimize.mad_mix.fma.precision) if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp b = inputs[1]; Temp c = inputs[2]; @@ -1562,6 +1578,8 @@ BEGIN_TEST(optimize.mad_mix.clamp) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1]; @@ -1587,6 +1605,8 @@ BEGIN_TEST(optimize.mad_mix.cast) if (!setup_cs("v1 v2b", (chip_class)i)) continue; + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + Temp a = inputs[0]; Temp a16 = inputs[1];