radeonsi: add radeonsi_force_use_fma32 driconf option

fma32 rounds only once, so it has 0.5 ULP accuracy. mad32 rounds twice,
so it has 1 ULP accuracy. This accuracy difference sometimes makes the
result differ in the last bit.

Applications like META need this extra accuracy to display the correct result.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13686>
This commit is contained in:
Qiang Yu 2021-11-05 10:05:09 +08:00 committed by Marge Bot
parent a0634a3c85
commit 3900551894
2 changed files with 7 additions and 2 deletions

View file

@ -15,6 +15,7 @@ OPT_BOOL(enable_sam, false, "Enable Smart Access Memory with Above 4G Decoding f
OPT_BOOL(disable_sam, false, "Disable Smart Access Memory.")
OPT_BOOL(fp16, false, "Enable FP16 for mediump.")
OPT_INT(tc_max_cpu_storage_size, 0, "Enable the CPU storage for pipelined buffer uploads in TC.")
OPT_BOOL(force_use_fma32, false, "Force use fma32 instruction for GPU family newer than gfx9")
#undef OPT_BOOL
#undef OPT_INT

View file

@ -998,6 +998,10 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
si_init_renderer_string(sscreen);
/* fma32 is too slow for gpu < gfx9, so force it only when gpu >= gfx9 */
bool force_fma32 =
sscreen->info.chip_class >= GFX9 && sscreen->options.force_use_fma32;
const struct nir_shader_compiler_options nir_options = {
.lower_scmp = true,
.lower_flrp16 = true,
@ -1026,10 +1030,10 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
* gfx10 and older prefer MAD for F32 because of the legacy instruction.
*/
.lower_ffma16 = sscreen->info.chip_class < GFX9,
.lower_ffma32 = sscreen->info.chip_class < GFX10_3,
.lower_ffma32 = sscreen->info.chip_class < GFX10_3 && !force_fma32,
.lower_ffma64 = false,
.fuse_ffma16 = sscreen->info.chip_class >= GFX9,
.fuse_ffma32 = sscreen->info.chip_class >= GFX10_3,
.fuse_ffma32 = sscreen->info.chip_class >= GFX10_3 || force_fma32,
.fuse_ffma64 = true,
.lower_fmod = true,
.lower_pack_snorm_4x8 = true,