From f99cab1c03399bddca75eb8441455eba342ee31a Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Sat, 13 Feb 2021 14:11:58 -0800
Subject: [PATCH] intel/compiler: Use CMPN for min / max on Gen4 and Gen5

On Intel platforms before Gen6, there is no min or max instruction.
Instead, a comparison instruction (*more on this below) and a SEL
instruction are used.  Per other IEEE rules, the regular comparison
instruction, CMP, will always return false if either source is NaN.  A
sequence like

    cmp.l.f0.0(16)  null<1>F        g30<8,8,1>F     g22<8,8,1>F
    (+f0.0) sel(16) g8<1>F          g30<8,8,1>F     g22<8,8,1>F

will generate the wrong result for min if g22 is NaN.  The CMP will
return false, and the SEL will pick g22.

To account for this, the hardware has a special comparison instruction
CMPN.  This instruction behaves just like CMP, except if the second
source is NaN, it will return true.  The intention is to use it for min
and max.  This sequence will always generate the correct result:

    cmpn.l.f0.0(16) null<1>F        g30<8,8,1>F     g22<8,8,1>F
    (+f0.0) sel(16) g8<1>F          g30<8,8,1>F     g22<8,8,1>F

The problem is... for whatever reason, we don't emit CMPN.  There was
even a comment in lower_minmax that calls out this very issue!  The bug
is actually older than the "Fixes" below even implies.  That's just when
the comment was added.  That we know of, we never observed a failure
until #4254.

If src1 is known to be a number, either because it's not float or it's
an immediate number, use CMP.  This allows cmod propagation to still do
its thing.  Without this slight optimization, about 8,300 shaders from
shader-db are hurt on Iron Lake.

Fixes the following piglit tests (from piglit!475):

    tests/spec/glsl-1.20/execution/fs-nan-builtin-max.shader_test
    tests/spec/glsl-1.20/execution/fs-nan-builtin-min.shader_test
    tests/spec/glsl-1.20/execution/vs-nan-builtin-max.shader_test
    tests/spec/glsl-1.20/execution/vs-nan-builtin-min.shader_test

Closes: #4254
Fixes: 2f2c00c7279 ("i965: Lower min/max after optimization on Gen4/5.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>

Iron Lake and GM45 had similar results. (Iron Lake shown)
total instructions in shared programs: 8115134 -> 8115135 (<.01%)
instructions in affected programs: 229 -> 230 (0.44%)
helped: 0

HURT: 1
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9027>
(cherry picked from commit 3c31364f5e7d34fdc977de20808bbb361f77184e)
---
 .pick_status.json               |  2 +-
 src/intel/compiler/brw_fs.cpp   | 16 ++++++++++++----
 src/intel/compiler/brw_vec4.cpp | 16 ++++++++++++----
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index a4c23b33da0..3919c50e767 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -2956,7 +2956,7 @@
         "description": "intel/compiler: Use CMPN for min / max on Gen4 and Gen5",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": "2f2c00c7279e7c43e520e21de1781f8cec263e92"
     },
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1c9e471f5de..dfd8e318756 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4173,11 +4173,19 @@ fs_visitor::lower_minmax()
 
       if (inst->opcode == BRW_OPCODE_SEL &&
           inst->predicate == BRW_PREDICATE_NONE) {
-         /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
-          *        the original SEL.L/GE instruction
+         /* If src1 is an immediate value that is not NaN, then it can't be
+          * NaN.  In that case, emit CMP because it is much better for cmod
+          * propagation.  Likewise if src1 is not float.  Gen4 and Gen5 don't
+          * support HF or DF, so it is not necessary to check for those.
           */
-         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
-                  inst->conditional_mod);
+         if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
+             (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
+            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                     inst->conditional_mod);
+         } else {
+            ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                      inst->conditional_mod);
+         }
          inst->predicate = BRW_PREDICATE_NORMAL;
          inst->conditional_mod = BRW_CONDITIONAL_NONE;
 
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 75bdd5934fb..ef115c741aa 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -1869,11 +1869,19 @@ vec4_visitor::lower_minmax()
 
       if (inst->opcode == BRW_OPCODE_SEL &&
           inst->predicate == BRW_PREDICATE_NONE) {
-         /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
-          *        the original SEL.L/GE instruction
+         /* If src1 is an immediate value that is not NaN, then it can't be
+          * NaN.  In that case, emit CMP because it is much better for cmod
+          * propagation.  Likewise if src1 is not float.  Gen4 and Gen5 don't
+          * support HF or DF, so it is not necessary to check for those.
           */
-         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
-                  inst->conditional_mod);
+         if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
+             (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
+            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                     inst->conditional_mod);
+         } else {
+            ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
+                      inst->conditional_mod);
+         }
          inst->predicate = BRW_PREDICATE_NORMAL;
          inst->conditional_mod = BRW_CONDITIONAL_NONE;