From f75c83c4aae2b01013f3740a7414cf207f60b0ab Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 9 Aug 2021 15:05:33 -0700
Subject: [PATCH] nir/loop_analyze: Fix get_iteration for nir_op_fneu

Consider the loop:

    float i = 0.0;
    while (true) {
       if (i != 0.0)
          break;

       i = i + 1.0;
    }

This loop clearly executes exactly one time.

Some trickery is necessary to handle cases where the initial loop value
is very large and the increment is, by comparison, very small.  From the
fneu_once test case,

    float i = -604462909807314587353088.0;
    while (true) {
       if (i != -604462909807314587353088.0)
          break;

       i = i + 36028797018963968.0;
    }

This loop should also execute exactly once, but this is much more
challenging to calculate due to precision issues.

Going towards smaller magnitude (i.e., adding a small positive value to
a large negative value) requires a smaller delta to make a difference
than going towards a larger magnitude. For this reason,
-604462909807314587353088.0 + 36028797018963968.0 !=
-604462909807314587353088.0, but -604462909807314587353088.0 +
-36028797018963968.0 == -604462909807314587353088.0. Math class is
tough.

No changes in shader-db or fossil-db.

v2: Fix major bug in checking result of the eval_const_binop(nir_op_feq,
...) discovered while developing fneu_once_easy unit test. Fix a typo in
the comment just above that. Add fneu_once_easy test.

v3: Skip the iteration count adjustment tests for nir_op_fneu and
nir_op_ine. Since the iteration count is either 1 or unknown, all this
function can do is add numerical error. Add fneu_once tests.

v4: Change the initial value in the fneu_once test from large positive
to large negative. Change check in get_iteration from nir_op_fsub to
nir_op_fadd. Both changes from discussion with M Henning. Also add some
more explanation in fneu_once.

v5: Rename test cases.

Fixes: 6772a17acc8 ("nir: Add a loop analysis pass")
Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19732>
---
 src/compiler/nir/nir_loop_analyze.c           |  22 ++-
 src/compiler/nir/tests/loop_analyze_tests.cpp | 185 ++++++++++++++++++
 2 files changed, 206 insertions(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_loop_analyze.c b/src/compiler/nir/nir_loop_analyze.c
index 4fd192489ef..d9eca46f765 100644
--- a/src/compiler/nir/nir_loop_analyze.c
+++ b/src/compiler/nir/nir_loop_analyze.c
@@ -779,10 +779,27 @@ get_iteration(nir_op cond_op, nir_const_value initial, nir_const_value step,
                               execution_mode);
       break;
 
+   case nir_op_fneu:
+      /* In order for execution to be here, limit must be the same as initial.
+       * Otherwise will_break_on_first_iteration would have returned false.
+       * If step is zero, the loop is infinite.  Otherwise the loop will
+       * execute once.
+       *
+       * This is a little more tricky for floating point since X-Y might still
+       * be X even if Y is not zero.  Instead check that (initial + step) !=
+       * initial.
+       */
+      span = eval_const_binop(nir_op_fadd, bit_size, initial, step,
+                              execution_mode);
+      iter = eval_const_binop(nir_op_feq, bit_size, initial,
+                              span, execution_mode);
+
+      /* return (initial + step) == initial ? -1 : 1 */
+      return iter.b ? -1 : 1;
+
    case nir_op_fge:
    case nir_op_flt:
    case nir_op_feq:
-   case nir_op_fneu:
       span = eval_const_binop(nir_op_fsub, bit_size, limit, initial,
                               execution_mode);
       iter = eval_const_binop(nir_op_fdiv, bit_size, span,
@@ -953,6 +970,9 @@ calculate_iterations(nir_const_value initial, nir_const_value step,
    if (iter_int < 0)
       return -1;
 
+   if (alu_op == nir_op_ine || alu_op == nir_op_fneu)
+      return iter_int;
+
    /* An explanation from the GLSL unrolling pass:
     *
     * Make sure that the calculated number of iterations satisfies the exit
diff --git a/src/compiler/nir/tests/loop_analyze_tests.cpp b/src/compiler/nir/tests/loop_analyze_tests.cpp
index 7ee24b73670..b6e60d20460 100644
--- a/src/compiler/nir/tests/loop_analyze_tests.cpp
+++ b/src/compiler/nir/tests/loop_analyze_tests.cpp
@@ -502,3 +502,188 @@ TEST_F(nir_loop_analyze_test, one_iteration_ieq)
    EXPECT_EQ(1, loop->info->max_trip_count);
    EXPECT_TRUE(loop->info->exact_trip_count_known);
 }
+
+TEST_F(nir_loop_analyze_test, one_iteration_easy_fneu)
+{
+   /* Create IR:
+    *
+    *    float i = 0.0;
+    *    while (true) {
+    *       if (i != 0.0)
+    *          break;
+    *
+    *       i = i + 1.0;
+    *    }
+    */
+   nir_ssa_def *ssa_0 = nir_imm_int(&b, 0x00000000);
+   nir_ssa_def *ssa_1 = nir_imm_int(&b, 0x3f800000);
+
+   nir_phi_instr *const phi = nir_phi_instr_create(b.shader);
+
+   nir_loop *loop = nir_push_loop(&b);
+   {
+      nir_ssa_dest_init(&phi->instr, &phi->dest,
+                        ssa_0->num_components, ssa_0->bit_size,
+                        NULL);
+
+      nir_phi_instr_add_src(phi, ssa_0->parent_instr->block,
+                            nir_src_for_ssa(ssa_0));
+
+      nir_ssa_def *ssa_4 = &phi->dest.ssa;
+      nir_ssa_def *ssa_2 = nir_fneu(&b, ssa_4, ssa_0);
+
+      nir_if *nif = nir_push_if(&b, ssa_2);
+      {
+         nir_jump_instr *jump = nir_jump_instr_create(b.shader, nir_jump_break);
+         nir_builder_instr_insert(&b, &jump->instr);
+      }
+      nir_pop_if(&b, nif);
+
+      nir_ssa_def *ssa_3 = nir_fadd(&b, ssa_4, ssa_1);
+
+      nir_phi_instr_add_src(phi, ssa_3->parent_instr->block,
+                            nir_src_for_ssa(ssa_3));
+   }
+   nir_pop_loop(&b, loop);
+
+   b.cursor = nir_before_block(nir_loop_first_block(loop));
+   nir_builder_instr_insert(&b, &phi->instr);
+
+   /* At this point, we should have:
+    *
+    * impl main {
+    *         block block_0:
+    *         // preds:
+    *         vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
+    *         vec1 32 ssa_1 = load_const (0x3f800000 = 1.000000)
+    *         // succs: block_1
+    *         loop {
+    *                 block block_1:
+    *                 // preds: block_0 block_4
+    *                 vec1 32 ssa_4 = phi block_0: ssa_0, block_4: ssa_3
+    *                 vec1  1 ssa_2 = fneu ssa_4, ssa_0
+    *                 // succs: block_2 block_3
+    *                 if ssa_2 {
+    *                         block block_2:
+    *                         // preds: block_1
+    *                         break
+    *                         // succs: block_5
+    *                 } else {
+    *                         block block_3:
+    *                         // preds: block_1
+    *                         // succs: block_4
+    *                 }
+    *                 block block_4:
+    *                 // preds: block_3
+    *                 vec1 32 ssa_3 = fadd ssa_4, ssa_1
+    *                 // succs: block_1
+    *         }
+    *         block block_5:
+    *         // preds: block_2
+    *         // succs: block_6
+    *         block block_6:
+    * }
+    */
+   nir_validate_shader(b.shader, "input");
+
+   nir_loop_analyze_impl(b.impl, nir_var_all, false);
+
+   ASSERT_NE((void *)0, loop->info);
+   EXPECT_EQ(1, loop->info->max_trip_count);
+   EXPECT_TRUE(loop->info->exact_trip_count_known);
+}
+
+TEST_F(nir_loop_analyze_test, one_iteration_fneu)
+{
+   /* Create IR:
+    *
+    *    float i = uintBitsToFloat(0xe7000000);
+    *    while (true) {
+    *       if (i != uintBitsToFloat(0xe7000000))
+    *          break;
+    *
+    *       i = i + uintBitsToFloat(0x5b000000);
+    *    }
+    *
+    * Going towards smaller magnitude (i.e., adding a small positive value to
+    * a large negative value) requires a smaller delta to make a difference
+    * than going towards a larger magnitude. For this reason, ssa_0 + ssa_1 !=
+    * ssa_0, but ssa_0 - ssa_1 == ssa_0. Math class is tough.
+    */
+   nir_ssa_def *ssa_0 = nir_imm_int(&b, 0xe7000000);
+   nir_ssa_def *ssa_1 = nir_imm_int(&b, 0x5b000000);
+
+   nir_phi_instr *const phi = nir_phi_instr_create(b.shader);
+
+   nir_loop *loop = nir_push_loop(&b);
+   {
+      nir_ssa_dest_init(&phi->instr, &phi->dest,
+                        ssa_0->num_components, ssa_0->bit_size,
+                        NULL);
+
+      nir_phi_instr_add_src(phi, ssa_0->parent_instr->block,
+                            nir_src_for_ssa(ssa_0));
+
+      nir_ssa_def *ssa_4 = &phi->dest.ssa;
+      nir_ssa_def *ssa_2 = nir_fneu(&b, ssa_4, ssa_0);
+
+      nir_if *nif = nir_push_if(&b, ssa_2);
+      {
+         nir_jump_instr *jump = nir_jump_instr_create(b.shader, nir_jump_break);
+         nir_builder_instr_insert(&b, &jump->instr);
+      }
+      nir_pop_if(&b, nif);
+
+      nir_ssa_def *ssa_3 = nir_fadd(&b, ssa_4, ssa_1);
+
+      nir_phi_instr_add_src(phi, ssa_3->parent_instr->block,
+                            nir_src_for_ssa(ssa_3));
+   }
+   nir_pop_loop(&b, loop);
+
+   b.cursor = nir_before_block(nir_loop_first_block(loop));
+   nir_builder_instr_insert(&b, &phi->instr);
+
+   /* At this point, we should have:
+    *
+    * impl main {
+    *         block block_0:
+    *         // preds:
+    *         vec1 32 ssa_0 = load_const (0xe7000000 = -604462909807314587353088.0)
+    *         vec1 32 ssa_1 = load_const (0x5b000000 = 36028797018963968.0)
+    *         // succs: block_1
+    *         loop {
+    *                 block block_1:
+    *                 // preds: block_0 block_4
+    *                 vec1 32 ssa_4 = phi block_0: ssa_0, block_4: ssa_3
+    *                 vec1  1 ssa_2 = fneu ssa_4, ssa_0
+    *                 // succs: block_2 block_3
+    *                 if ssa_2 {
+    *                         block block_2:
+    *                         // preds: block_1
+    *                         break
+    *                         // succs: block_5
+    *                 } else {
+    *                         block block_3:
+    *                         // preds: block_1
+    *                         // succs: block_4
+    *                 }
+    *                 block block_4:
+    *                 // preds: block_3
+    *                 vec1 32 ssa_3 = fadd ssa_4, ssa_1
+    *                 // succs: block_1
+    *         }
+    *         block block_5:
+    *         // preds: block_2
+    *         // succs: block_6
+    *         block block_6:
+    * }
+    */
+   nir_validate_shader(b.shader, "input");
+
+   nir_loop_analyze_impl(b.impl, nir_var_all, false);
+
+   ASSERT_NE((void *)0, loop->info);
+   EXPECT_EQ(1, loop->info->max_trip_count);
+   EXPECT_TRUE(loop->info->exact_trip_count_known);
+}