From 3f151c03af3d159c8725f5f6dc7050b6a418de51 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 1 May 2024 21:02:13 -0700
Subject: [PATCH] intel/brw: Handle fsign optimization in a NIR algebraic pass

This is a lot less code, and it makes it easier to experiment with other
pattern-based optimizations in the future.

The results here are nearly identical to the results I got from Ken's
"intel/brw: Make fsign (for 16/32-bit) in SSA form"... which are not
particularly good.

In this commit and in Ken's, all of the shader-db shaders hurt for
spills and fills are from Deus Ex Mankind Divided. Each shader has a
bunch of texture instructions with a single fsign between the
blocks. With the dependency on the flag removed, the scheduler puts all
of the texture instructions at the start... and there are a LOT of them.

shader-db:

All Intel platforms had similar results. (Meteor Lake shown)
total instructions in shared programs: 19647060 -> 19650207 (0.02%)
instructions in affected programs: 734718 -> 737865 (0.43%)
helped: 382 / HURT: 1984

total cycles in shared programs: 823238442 -> 822785913 (-0.05%)
cycles in affected programs: 426901157 -> 426448628 (-0.11%)
helped: 3408 / HURT: 3671

total spills in shared programs: 3887 -> 3891 (0.10%)
spills in affected programs: 256 -> 260 (1.56%)
helped: 0 / HURT: 4

total fills in shared programs: 3236 -> 3306 (2.16%)
fills in affected programs: 882 -> 952 (7.94%)
helped: 0 / HURT: 12

LOST:   37
GAINED: 34

fossil-db:

DG2 and Meteor Lake had similar results. (Meteor Lake shown)
Totals:
Instrs: 154005469 -> 154008294 (+0.00%); split: -0.00%, +0.00%
Cycle count: 17551859277 -> 17554293955 (+0.01%); split: -0.02%, +0.04%
Spill count: 142078 -> 142090 (+0.01%)
Fill count: 266761 -> 266729 (-0.01%); split: -0.02%, +0.01%
Max live registers: 32593578 -> 32593858 (+0.00%)
Max dispatch width: 5535944 -> 5536816 (+0.02%); split: +0.02%, -0.01%

Totals from 5867 (0.93% of 631350) affected shaders:
Instrs: 5475544 -> 5478369 (+0.05%); split: -0.04%, +0.09%
Cycle count: 1649032029 -> 1651466707 (+0.15%); split: -0.24%, +0.39%
Spill count: 26411 -> 26423 (+0.05%)
Fill count: 57364 -> 57332 (-0.06%); split: -0.10%, +0.04%
Max live registers: 431561 -> 431841 (+0.06%)
Max dispatch width: 49784 -> 50656 (+1.75%); split: +2.38%, -0.63%

Tiger Lake
Totals:
Instrs: 149530671 -> 149533588 (+0.00%); split: -0.00%, +0.00%
Cycle count: 15261418953 -> 15264764921 (+0.02%); split: -0.00%, +0.03%
Spill count: 60317 -> 60316 (-0.00%); split: -0.02%, +0.01%
Max live registers: 32249201 -> 32249464 (+0.00%)
Max dispatch width: 5540608 -> 5540584 (-0.00%)

Totals from 5862 (0.93% of 630309) affected shaders:
Instrs: 4740800 -> 4743717 (+0.06%); split: -0.04%, +0.10%
Cycle count: 566531248 -> 569877216 (+0.59%); split: -0.13%, +0.72%
Spill count: 11709 -> 11708 (-0.01%); split: -0.09%, +0.08%
Max live registers: 424560 -> 424823 (+0.06%)
Max dispatch width: 50304 -> 50280 (-0.05%)

Ice Lake
Totals:
Instrs: 150499705 -> 150502608 (+0.00%); split: -0.00%, +0.00%
Cycle count: 15105629116 -> 15105425880 (-0.00%); split: -0.00%, +0.00%
Spill count: 60087 -> 60090 (+0.00%)
Fill count: 100542 -> 100541 (-0.00%); split: -0.00%, +0.00%
Max live registers: 32605215 -> 32605495 (+0.00%)
Max dispatch width: 5617752 -> 5617792 (+0.00%); split: +0.00%, -0.00%

Totals from 5882 (0.93% of 634934) affected shaders:
Instrs: 4737206 -> 4740109 (+0.06%); split: -0.04%, +0.10%
Cycle count: 598882104 -> 598678868 (-0.03%); split: -0.08%, +0.05%
Spill count: 10278 -> 10281 (+0.03%)
Fill count: 22504 -> 22503 (-0.00%); split: -0.01%, +0.01%
Max live registers: 424184 -> 424464 (+0.07%)
Max dispatch width: 50216 -> 50256 (+0.08%); split: +0.25%, -0.18%

Skylake
Totals:
Instrs: 139092612 -> 139095257 (+0.00%); split: -0.00%, +0.00%
Cycle count: 14533550285 -> 14533544716 (-0.00%); split: -0.00%, +0.00%
Spill count: 58176 -> 58172 (-0.01%)
Fill count: 95877 -> 95796 (-0.08%)
Max live registers: 31924594 -> 31924874 (+0.00%)
Max dispatch width: 5484568 -> 5484552 (-0.00%); split: +0.00%, -0.00%

Totals from 5789 (0.93% of 625512) affected shaders:
Instrs: 4481987 -> 4484632 (+0.06%); split: -0.04%, +0.10%
Cycle count: 578310124 -> 578304555 (-0.00%); split: -0.05%, +0.05%
Spill count: 9248 -> 9244 (-0.04%)
Fill count: 19677 -> 19596 (-0.41%)
Max live registers: 415340 -> 415620 (+0.07%)
Max dispatch width: 49720 -> 49704 (-0.03%); split: +0.10%, -0.13%

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29095>
---
 src/intel/compiler/brw_fs_nir.cpp         | 143 +---------------------
 src/intel/compiler/brw_nir.c              |   6 +-
 src/intel/compiler/brw_nir.h              |   2 +
 src/intel/compiler/brw_nir_lower_fsign.py |  53 ++++++++
 src/intel/compiler/meson.build            |  13 +-
 5 files changed, 73 insertions(+), 144 deletions(-)
 create mode 100644 src/intel/compiler/brw_nir_lower_fsign.py

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 742d4314beb..dad87ccb80f 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -849,135 +849,6 @@ try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld,
    return true;
 }
 
-/**
- * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
- *
- * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
- * the source of \c instr that is a \c nir_op_fsign.
- */
-static void
-emit_fsign(nir_to_brw_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
-           fs_reg result, fs_reg *op, unsigned fsign_src)
-{
-   const intel_device_info *devinfo = ntb.devinfo;
-
-   fs_inst *inst;
-
-   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
-   assert(fsign_src < nir_op_infos[instr->op].num_inputs);
-
-   if (instr->op != nir_op_fsign) {
-      const nir_alu_instr *const fsign_instr =
-         nir_src_as_alu_instr(instr->src[fsign_src].src);
-
-      /* op[fsign_src] has the nominal result of the fsign, and op[1 -
-       * fsign_src] has the other multiply source.  This must be rearranged so
-       * that op[0] is the source of the fsign op[1] is the other multiply
-       * source.
-       */
-      if (fsign_src != 0)
-         op[1] = op[0];
-
-      op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
-
-      const nir_alu_type t =
-         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
-                        nir_src_bit_size(fsign_instr->src[0].src));
-
-      op[0].type = brw_type_for_nir_type(devinfo, t);
-
-      unsigned channel = 0;
-      if (nir_op_infos[instr->op].output_size == 0) {
-         /* Since NIR is doing the scalarizing for us, we should only ever see
-          * vectorized operations with a single channel.
-          */
-         nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
-         assert(util_bitcount(write_mask) == 1);
-         channel = ffs(write_mask) - 1;
-      }
-
-      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
-   }
-
-   if (brw_type_size_bytes(op[0].type) == 2) {
-      /* AND(val, 0x8000) gives the sign bit.
-       *
-       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
-       */
-      fs_reg zero = retype(brw_imm_uw(0), BRW_TYPE_HF);
-      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
-
-      op[0].type = BRW_TYPE_UW;
-      result.type = BRW_TYPE_UW;
-      bld.AND(result, op[0], brw_imm_uw(0x8000u));
-
-      if (instr->op == nir_op_fsign)
-         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
-      else {
-         /* Use XOR here to get the result sign correct. */
-         inst = bld.XOR(result, result, retype(op[1], BRW_TYPE_UW));
-      }
-
-      inst->predicate = BRW_PREDICATE_NORMAL;
-   } else if (brw_type_size_bytes(op[0].type) == 4) {
-      /* AND(val, 0x80000000) gives the sign bit.
-       *
-       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
-       * zero.
-       */
-      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
-
-      op[0].type = BRW_TYPE_UD;
-      result.type = BRW_TYPE_UD;
-      bld.AND(result, op[0], brw_imm_ud(0x80000000u));
-
-      if (instr->op == nir_op_fsign)
-         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
-      else {
-         /* Use XOR here to get the result sign correct. */
-         inst = bld.XOR(result, result, retype(op[1], BRW_TYPE_UD));
-      }
-
-      inst->predicate = BRW_PREDICATE_NORMAL;
-   } else {
-      unreachable("Should have been lowered by nir_opt_algebraic.");
-   }
-}
-
-/**
- * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
- *
- * Checks the operands of a \c nir_op_fmul to determine whether or not
- * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
- *
- * \param instr  The multiplication instruction
- *
- * \param fsign_src The source of \c instr that may or may not be a
- *                  \c nir_op_fsign
- */
-static bool
-can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
-{
-   assert(instr->op == nir_op_fmul);
-
-   nir_alu_instr *const fsign_instr =
-      nir_src_as_alu_instr(instr->src[fsign_src].src);
-
-   /* Rules:
-    *
-    * 1. instr->src[fsign_src] must be a nir_op_fsign.
-    * 2. The nir_op_fsign can only be used by this multiplication.
-    * 3. The source that is the nir_op_fsign does not have source modifiers.
-    *    \c emit_fsign only examines the source modifiers of the source of the
-    *    \c nir_op_fsign.
-    *
-    * The nir_op_fsign must also not have the saturate modifier, but steps
-    * have already been taken (in nir_opt_algebraic) to ensure that.
-    */
-   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
-          is_used_once(fsign_instr);
-}
-
 static bool
 is_const_zero(const nir_src &src)
 {
@@ -1233,8 +1104,7 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
       break;
 
    case nir_op_fsign:
-      emit_fsign(ntb, bld, instr, result, op, 0);
-      break;
+      unreachable("Should have been lowered by brw_nir_lower_fsign.");
 
    case nir_op_frcp:
       bld.RCP(result, op[0]);
@@ -1322,17 +1192,6 @@ fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
    }
 
    case nir_op_fmul:
-      for (unsigned i = 0; i < 2; i++) {
-         if (can_fuse_fmul_fsign(instr, i)) {
-            emit_fsign(ntb, bld, instr, result, op, i);
-            return;
-         }
-      }
-
-      /* We emit the rounding mode after the previous fsign optimization since
-       * it won't result in a MUL, but will try to negate the value by other
-       * means.
-       */
       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
          brw_rnd_mode rnd =
             brw_rnd_mode_from_execution_mode(execution_mode);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 87b06d4c154..3f025097c26 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1711,7 +1711,11 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 
    do {
       progress = false;
-      if (OPT(nir_opt_algebraic_late)) {
+
+      OPT(nir_opt_algebraic_late);
+      OPT(brw_nir_lower_fsign);
+
+      if (progress) {
          OPT(nir_opt_constant_folding);
          OPT(nir_copy_prop);
          OPT(nir_opt_dce);
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 744ba46a360..561afb8bee6 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -195,6 +195,8 @@ bool brw_nir_apply_trig_workarounds(nir_shader *nir);
 
 bool brw_nir_limit_trig_input_range_workaround(nir_shader *nir);
 
+bool brw_nir_lower_fsign(nir_shader *nir);
+
 void brw_nir_apply_key(nir_shader *nir,
                        const struct brw_compiler *compiler,
                        const struct brw_base_prog_key *key,
diff --git a/src/intel/compiler/brw_nir_lower_fsign.py b/src/intel/compiler/brw_nir_lower_fsign.py
new file mode 100644
index 00000000000..3df2441fe77
--- /dev/null
+++ b/src/intel/compiler/brw_nir_lower_fsign.py
@@ -0,0 +1,53 @@
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+import argparse
+import sys
+from math import pi
+
+a = 'a'
+b = 'b'
+
+lower_fsign = [
+    # This matches the behavior of the old optimization in brw_fs_nir.cpp, but
+    # it has some problems.
+    #
+    # The fmul version passes Vulkan float_controls2 CTS a little bit by
+    # luck. The use of fne means that the false path (i.e., fsign(X) == 0) is
+    # only taken when X is zero. For OpenCL, this path should also be taken
+    # when when X is NaN. This can be handled by using 'fabs(X) > 0', but this
+    # fails float_controls2 CTS when the other multiplication operand is NaN.
+    #
+    # This optimization is additionally problematic when fsign(X) is zero and
+    # the other multiplication operand is Inf. This will result in 0, but it
+    # should result in NaN. This does not seem to be tested by the CTS.
+    #
+    # NOTE: fcsel opcodes are currently limited to float32 in NIR.
+    (('fmul@32', ('fsign(is_used_once)', a), b), ('fcsel',          a    , ('ixor', ('iand', a, 0x80000000), b), ('iand', a, 0x80000000))),
+    (('fmul@16', ('fsign(is_used_once)', a), b), ('bcsel', ('fneu', a, 0), ('ixor', ('iand', a, 0x8000    ), b), ('iand', a, 0x8000    ))),
+
+    # This is 99.99% strictly correct for OpenCL. It will provide correctly
+    # signed zero for ±0 inputs, and it will provide zero for NaN inputs. The
+    # only slight deviation is that it can provide -0 for some NaN inputs.
+    (('fsign@32', a), ('fcsel_gt',          ('fabs', a) , ('ior', ('iand', a, 0x80000000), 0x3f800000), ('iand', a, 0x80000000))),
+    (('fsign@16', a), ('bcsel', ('!flt', 0, ('fabs', a)), ('ior', ('iand', a, 0x8000    ), 0x3c00    ), ('iand', a, 0x8000    ))),
+]
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--import-path', required=True)
+    args = parser.parse_args()
+    sys.path.insert(0, args.import_path)
+    run()
+
+
+def run():
+    import nir_algebraic  # pylint: disable=import-error
+
+    print('#include "brw_nir.h"')
+
+    print(nir_algebraic.AlgebraicPass("brw_nir_lower_fsign", lower_fsign).render())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 1ad4f95e811..74237fbdef1 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -130,6 +130,17 @@ brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c',
                                         command : [prog_python, '@INPUT0@', '--outdir', meson.current_build_dir()])
 
 
+brw_nir_lower_fsign = custom_target(
+  'brw_nir_lower_fsign.c',
+  input : 'brw_nir_lower_fsign.py',
+  output : 'brw_nir_lower_fsign.c',
+  command : [
+    prog_python, '@INPUT@', '-p', dir_compiler_nir,
+  ],
+  depend_files : nir_algebraic_depends,
+  capture : true,
+)
+
 brw_nir_trig = custom_target(
   'brw_nir_trig_workarounds.c',
   input : 'brw_nir_trig_workarounds.py',
@@ -143,7 +154,7 @@ brw_nir_trig = custom_target(
 
 libintel_compiler_brw = static_library(
   'intel_compiler',
-  [libintel_compiler_brw_files, intel_nir_files, brw_nir_trig, ir_expression_operation_h, [brw_device_sha1_gen_src]],
+  [libintel_compiler_brw_files, intel_nir_files, brw_nir_lower_fsign, brw_nir_trig, ir_expression_operation_h, [brw_device_sha1_gen_src]],
   include_directories : [inc_include, inc_src, inc_intel],
   c_args : [no_override_init_args],
   gnu_symbol_visibility : 'hidden',