From 97e3c6a12a2c981bf70e64fbd4ab4e7bbf8601eb Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Fri, 3 May 2024 11:07:39 -0700
Subject: [PATCH] intel/brw: Use range analysis to optimize fsign

shader-db:

Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19674784 -> 19665960 (-0.04%)
instructions in affected programs: 933425 -> 924601 (-0.95%)
helped: 3656 / HURT: 0

total cycles in shared programs: 810343919 -> 810241030 (-0.01%)
cycles in affected programs: 56752034 -> 56649145 (-0.18%)
helped: 3032 / HURT: 434

LOST:   11
GAINED: 0

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20315795 -> 20305856 (-0.05%)
instructions in affected programs: 979698 -> 969759 (-1.01%)
helped: 3845 / HURT: 0

total cycles in shared programs: 830600281 -> 830534694 (<.01%)
cycles in affected programs: 45675615 -> 45610028 (-0.14%)
helped: 3250 / HURT: 325

total spills in shared programs: 4583 -> 4565 (-0.39%)
spills in affected programs: 180 -> 162 (-10.00%)
helped: 3 / HURT: 0

total fills in shared programs: 5245 -> 5219 (-0.50%)
fills in affected programs: 379 -> 353 (-6.86%)
helped: 3 / HURT: 0

LOST:   14
GAINED: 8

fossil-db:

All Intel platforms except Tiger Lake had similar results. (Meteor Lake shown)
Totals:
Instrs: 154024263 -> 154023814 (-0.00%)
Cycle count: 17463341602 -> 17461726239 (-0.01%); split: -0.01%, +0.00%

Totals from 322 (0.05% of 631440) affected shaders:
Instrs: 199933 -> 199484 (-0.22%)
Cycle count: 168492537 -> 166877174 (-0.96%); split: -0.96%, +0.00%

Tiger Lake
Instrs: 149984723 -> 149984287 (-0.00%)
Cycle count: 15238596937 -> 15239260415 (+0.00%); split: -0.00%, +0.01%
Max dispatch width: 5553408 -> 5553424 (+0.00%)

Totals from 318 (0.05% of 631414) affected shaders:
Instrs: 179624 -> 179188 (-0.24%)
Cycle count: 160724533 -> 161388011 (+0.41%); split: -0.06%, +0.48%
Max dispatch width: 3296 -> 3312 (+0.49%)

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29095>
---
 src/intel/compiler/brw_nir_lower_fsign.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/intel/compiler/brw_nir_lower_fsign.py b/src/intel/compiler/brw_nir_lower_fsign.py
index 6821b3d98f5..04c6a2bde68 100644
--- a/src/intel/compiler/brw_nir_lower_fsign.py
+++ b/src/intel/compiler/brw_nir_lower_fsign.py
@@ -16,6 +16,16 @@ lower_fsign = [
     # is_finite.
     #
     # NOTE: fcsel opcodes are currently limited to float32 in NIR.
+    (('fmul@32(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('fcsel_gt',          a ,          b , ('fmul', b, 0.0       ))),
+    (('~fmul@32',                       ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('fcsel_gt',          a ,          b ,             0.0        )),
+    (('fmul@32(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('fcsel_gt', ('fneg', a), ('fneg', b), ('fmul', b, 0x80000000))),
+    (('~fmul@32',                       ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('fcsel_gt', ('fneg', a), ('fneg', b),             0x80000000 )),
+
+    (('fmul@16(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('bcsel', ('!flt', 0,          a ),          b , ('fmul', b, 0.0   ))),
+    (('~fmul@16',                       ('fsign(is_used_once)', 'a(is_not_negative)'), b), ('bcsel', ('!flt', 0,          a ),          b ,             0.0    )),
+    (('fmul@16(is_only_used_as_float)', ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('bcsel', ('!flt', 0, ('fneg', a)), ('fneg', b), ('fmul', b, 0x8000))),
+    (('~fmul@16',                       ('fsign(is_used_once)', 'a(is_not_positive)'), b), ('bcsel', ('!flt', 0, ('fneg', a)), ('fneg', b),             0x8000 )),
+
     (('fmul@32(is_only_used_as_float,nsz)',      ('fsign(is_used_once)', a), 'b(is_finite)'), ('fcsel_gt', a, b, ('fcsel_gt', ('fneg', a), ('fneg', b), 0.0))),
     (('fmul@32(is_only_used_as_float,nsz,nnan)', ('fsign(is_used_once)', a),  b            ), ('fcsel_gt', a, b, ('fcsel_gt', ('fneg', a), ('fneg', b), 0.0))),
     (('~fmul@32',                                ('fsign(is_used_once)', a),  b            ), ('fcsel_gt', a, b, ('fcsel_gt', ('fneg', a), ('fneg', b), 0.0))),
@@ -25,6 +35,12 @@ lower_fsign = [
     # only slight deviation is that it can provide -0 for some NaN inputs.
     (('fsign@32', a), ('fcsel_gt',          ('fabs', a) , ('ior', ('iand', a, 0x80000000), 0x3f800000), ('iand', a, 0x80000000))),
     (('fsign@16', a), ('bcsel', ('!flt', 0, ('fabs', a)), ('ior', ('iand', a, 0x8000    ), 0x3c00    ), ('iand', a, 0x8000    ))),
+
+    # a*0.0 or a*-0.0 differs from +0.0 only in the sign of zero (ignored by nsz) or by being NaN when 'a' is infinity or NaN.
+    (('fmul(nsz,nnan)', 'a', 0.0), 0.0),
+    (('fmul(nsz)', 'a(is_finite)', 0.0), 0.0),
+    (('fmul(nsz,nnan)', 'a@32', 0x80000000), 0.0),
+    (('fmul(nsz,nnan)', 'a@16', 0x8000    ), 0.0),
 ]
 
 def main():