diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c index 3fa0cc88334..96c83fa8a71 100644 --- a/src/amd/common/ac_binary.c +++ b/src/amd/common/ac_binary.c @@ -138,6 +138,6 @@ void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wav * - denormals break v_mad_f32 * - GFX6 & GFX7 would be very slow */ - conf->float_mode &= ~V_00B028_FP_ALL_DENORMS; - conf->float_mode |= V_00B028_FP_64_DENORMS; + conf->float_mode &= ~V_00B028_FP_32_DENORMS; + conf->float_mode |= V_00B028_FP_16_64_DENORMS; } diff --git a/src/amd/registers/gfx10.json b/src/amd/registers/gfx10.json index 53edb48c7af..5bbaf866800 100644 --- a/src/amd/registers/gfx10.json +++ b/src/amd/registers/gfx10.json @@ -295,9 +295,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/gfx103.json b/src/amd/registers/gfx103.json index 4a83aff4eca..b2c02908ae7 100644 --- a/src/amd/registers/gfx103.json +++ b/src/amd/registers/gfx103.json @@ -249,9 +249,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/gfx11.json b/src/amd/registers/gfx11.json index 334c1b1b29e..38308f06641 100644 --- a/src/amd/registers/gfx11.json +++ b/src/amd/registers/gfx11.json @@ -229,9 +229,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/gfx6.json b/src/amd/registers/gfx6.json index 0384b5b72d7..a9ea8a6c28c 100644 --- a/src/amd/registers/gfx6.json +++ b/src/amd/registers/gfx6.json @@ -233,9 +233,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/gfx7.json b/src/amd/registers/gfx7.json index 2c29aac5290..556e893f3e8 100644 --- a/src/amd/registers/gfx7.json +++ b/src/amd/registers/gfx7.json @@ -249,9 +249,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/gfx8.json b/src/amd/registers/gfx8.json index e0c4eab5787..1ad04509ba7 100644 --- a/src/amd/registers/gfx8.json +++ b/src/amd/registers/gfx8.json @@ -270,9 +270,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/gfx81.json b/src/amd/registers/gfx81.json index 667e8fa9f37..c8c834d37fa 100644 --- a/src/amd/registers/gfx81.json +++ b/src/amd/registers/gfx81.json @@ -277,9 +277,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/gfx9.json b/src/amd/registers/gfx9.json index 8ce72415ddb..6d0f3c9d14c 100644 --- a/src/amd/registers/gfx9.json +++ b/src/amd/registers/gfx9.json @@ -310,9 +310,10 @@ }, "FLOAT_MODE": { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192} ] }, "ForceControl": { diff --git a/src/amd/registers/parse_kernel_headers.py b/src/amd/registers/parse_kernel_headers.py index 4bae19597ff..67883f40975 100644 --- a/src/amd/registers/parse_kernel_headers.py +++ b/src/amd/registers/parse_kernel_headers.py @@ -418,9 +418,10 @@ VRSHtileEncoding = { missing_enums_all = { 'FLOAT_MODE': { "entries": [ + {"name": "FP_32_ROUND_TOWARDS_ZERO", "value": 3}, + {"name": "FP_16_64_ROUND_TOWARDS_ZERO", "value": 12}, {"name": "FP_32_DENORMS", "value": 48}, - {"name": "FP_64_DENORMS", "value": 192}, - {"name": "FP_ALL_DENORMS", "value": 240} + {"name": "FP_16_64_DENORMS", "value": 192}, ] }, 'QUANT_MODE': { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 531fc60ca95..63cafa8ef04 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1839,6 +1839,33 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi shader->info.uses_instanceid = sel->info.uses_instanceid; shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4); + /* Set the FP ALU behavior. */ + /* By default, we disable denormals for FP32 and enable them for FP16 and FP64 + * for performance and correctness reasons. FP32 denormals can't be enabled because + * they break output modifiers and v_mad_f32 and are very slow on GFX6-7. + * + * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags + * can be set simultaneously, which means we are allowed to choose, but not really because + * some options cause GLCTS failures. + */ + unsigned float_mode = V_00B028_FP_16_64_DENORMS; + + if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) && + nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) + float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO; + + if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) && + nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | + FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) + float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO; + + if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 | + FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) && + nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | + FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)) + float_mode &= ~V_00B028_FP_16_64_DENORMS; + /* TODO: ACO could compile non-monolithic shaders here (starting * with PS and NGG VS), but monolithic shaders should be compiled * by LLVM due to more complicated compilation. @@ -1846,6 +1873,8 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (!si_llvm_compile_shader(sscreen, compiler, shader, &so, debug, nir, free_nir)) return false; + shader->config.float_mode = float_mode; + /* The GS copy shader is compiled next. */ if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, &so, debug);