From 22dc06798b1d69e2588b755d48a21c63e17d55c2 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Fri, 31 Oct 2025 13:12:34 +0100 Subject: [PATCH] aco/optimizer: never unfuse fma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This shouldn't change anything in practice, and reducing precision if precise isn't set is weird. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 5 ++--- src/amd/compiler/tests/test_optimizer.cpp | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index faa772317fe..73724495e14 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -880,8 +880,7 @@ alu_opt_info_is_valid(opt_ctx& ctx, alu_opt_info& info) info.operands[2].neg[0] = true; break; case aco_opcode::v_fma_f32: - // TODO remove precise, not clear why unfusing fma would be valid - if (!ctx.program->dev.fused_mad_mix && info.defs[0].isPrecise()) + if (!ctx.program->dev.fused_mad_mix) return false; break; case aco_opcode::v_mad_f32: @@ -4370,7 +4369,7 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr) case aco_opcode::v_subrev_f32: case aco_opcode::v_mul_f32: return !instr->isSDWA() && !instr->isDPP(); case aco_opcode::v_fma_f32: - return ctx.program->dev.fused_mad_mix || !instr->definitions[0].isPrecise(); + return ctx.program->dev.fused_mad_mix; case aco_opcode::v_fma_mix_f32: case aco_opcode::v_fma_mixlo_f16: return true; default: return false; diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 04ba4512680..93b95684361 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -1050,9 +1050,11 @@ BEGIN_TEST(optimize.mad_mix.input_conv.basic) //! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16) //! 
p_unit_test 3, %res3 - writeout(3, fma(a, a, f2f32(a16))); + writeout(3, fadd(fmul(a, a), f2f32(a16))); - //! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16) + //~gfx9! v1: %tmp4 = v_cvt_f32_f16 %a16 + //~gfx9! v1: %res4 = v_fma_f32 %a, %a, %tmp4 + //~gfx10! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16) //! p_unit_test 4, %res4 writeout(4, fma(a, a, f2f32(a16))); @@ -1239,7 +1241,9 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic) //! p_unit_test 1, %res1 writeout(1, f2f16(fadd(a, b))); - //! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c + //~gfx9! v1: %tmp2 = v_fma_f32 %a, %b, %c + //~gfx9! v2b: %res2 = v_cvt_f16_f32 %tmp2 + //~gfx10! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c //! p_unit_test 2, %res2 writeout(2, f2f16(fma(a, b, c))); @@ -1253,7 +1257,11 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic) //! v2b: %res5 = v_fma_mixlo_f16 %a, lo(%b16), %c //! p_unit_test 5, %res5 - writeout(5, f2f16(fma(a, f2f32(b16), c))); + writeout(5, f2f16(fadd(fmul(a, f2f32(b16)), c))); + + //! v2b: %res6 = v_fma_mixlo_f16 %a, %b, %c + //! p_unit_test 6, %res6 + writeout(6, f2f16(fadd(fmul(a, b), c))); finish_opt_test(); }