diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index d227358322d..b50314dec47 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -853,10 +853,10 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) } break; case nir_op_ffma: - /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */ - result = - emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd", - ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]); + /* FMA is slow on gfx6-8, so drivers must lower ffma before it gets here on those chips. */ + assert(ctx->ac.chip_class >= GFX9); + result = emit_intrin_3f_param(&ctx->ac, "llvm.fma", ac_to_float_type(&ctx->ac, def_type), + src[0], src[1], src[2]); break; case nir_op_ldexp: src[0] = ac_to_float(&ctx->ac, src[0]); diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 059bd8d00e8..e7479012ea9 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -937,7 +937,16 @@ void si_init_screen_get_functions(struct si_screen *sscreen) .lower_bitfield_insert_to_bitfield_select = true, .lower_bitfield_extract = true, .lower_sub = true, - .fuse_ffma = true, + /* gfx6-8: use MAD (FMA is 4x slower) + * gfx9-10: either is OK (MAD and FMA have the same performance) + * gfx10.3: use FMA (MAD doesn't exist, separate MUL+ADD are 2x slower) + * + * FMA has no advantage on gfx9-10 and MAD allows more algebraic optimizations. + * Keep FMA enabled on gfx10 to test it, which helps us validate correctness + * for gfx10.3 on gfx10. + */ + .lower_ffma = sscreen->info.chip_class <= GFX9, + .fuse_ffma = sscreen->info.chip_class >= GFX10, .lower_fmod = true, .lower_pack_snorm_4x8 = true, .lower_pack_unorm_4x8 = true,