radeonsi: add radeonsi_force_use_fma32 driconf option

fma32 rounds only once, so it has 0.5 ULP accuracy. mad32 rounds twice, so it has 1 ULP accuracy. This accuracy difference sometimes makes the result differ in the last bit. Applications like META need the extra accuracy to display the right result. Reviewed-by: Marek Olšák <marek.olsak@amd.com> Signed-off-by: Qiang Yu <yuq825@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13686>
2026-05-01 23:18:20 +02:00 · 2021-11-05 10:05:09 +08:00 · 2021-11-05 10:05:09 +08:00 · 3900551894
commit 3900551894
parent a0634a3c85
2 changed files with 7 additions and 2 deletions
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
@ -15,6 +15,7 @@ OPT_BOOL(enable_sam, false, "Enable Smart Access Memory with Above 4G Decoding f
 OPT_BOOL(disable_sam, false, "Disable Smart Access Memory.")
 OPT_BOOL(fp16, false, "Enable FP16 for mediump.")
 OPT_INT(tc_max_cpu_storage_size, 0, "Enable the CPU storage for pipelined buffer uploads in TC.")
+OPT_BOOL(force_use_fma32, false, "Force use fma32 instruction for GPU family newer than gfx9")

 #undef OPT_BOOL
 #undef OPT_INT
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@ -998,6 +998,10 @@ void si_init_screen_get_functions(struct si_screen *sscreen)

   si_init_renderer_string(sscreen);

+   /* fma32 is too slow for gpu < gfx9, so force it only when gpu >= gfx9 */
+   bool force_fma32 =
+      sscreen->info.chip_class >= GFX9 && sscreen->options.force_use_fma32;
+
   const struct nir_shader_compiler_options nir_options = {
      .lower_scmp = true,
      .lower_flrp16 = true,
@ -1026,10 +1030,10 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
       * gfx10 and older prefer MAD for F32 because of the legacy instruction.
       */
      .lower_ffma16 = sscreen->info.chip_class < GFX9,
-      .lower_ffma32 = sscreen->info.chip_class < GFX10_3,
+      .lower_ffma32 = sscreen->info.chip_class < GFX10_3 && !force_fma32,
      .lower_ffma64 = false,
      .fuse_ffma16 = sscreen->info.chip_class >= GFX9,
-      .fuse_ffma32 = sscreen->info.chip_class >= GFX10_3,
+      .fuse_ffma32 = sscreen->info.chip_class >= GFX10_3 || force_fma32,
      .fuse_ffma64 = true,
      .lower_fmod = true,
      .lower_pack_snorm_4x8 = true,