# mesa/src/intel/compiler/brw_nir_lower_fsign.py

# Copyright © 2024 Intel Corporation
# SPDX-License-Identifier: MIT

import argparse
import sys
from math import pi

# Operand placeholders used in the patterns below. Binding the strings to
# names lets the expression tuples be written without quoting every operand.
a = 'a'
b = 'b'

# Table handed to nir_algebraic.AlgebraicPass in run(). Each entry is a
# two-element tuple: the expression to match, followed by the expression that
# replaces it.
lower_fsign = [
    # This matches the behavior of the old optimization in brw_fs_nir.cpp, but
    # it has some problems.
    #
    # The fmul version passes Vulkan float_controls2 CTS a little bit by
    # luck. The use of fne means that the false path (i.e., fsign(X) == 0) is
    # only taken when X is zero. For OpenCL, this path should also be taken
    # when X is NaN. This can be handled by using 'fabs(X) > 0', but this
    # fails float_controls2 CTS when the other multiplication operand is NaN.
    #
    # This optimization is additionally problematic when fsign(X) is zero and
    # the other multiplication operand is Inf. This will result in 0, but it
    # should result in NaN. This does not seem to be tested by the CTS.
    #
    # NOTE: fcsel opcodes are currently limited to float32 in NIR.
    (('fmul@32', ('fsign(is_used_once)', a), b), ('fcsel',          a    , ('ixor', ('iand', a, 0x80000000), b), ('iand', a, 0x80000000))),
    (('fmul@16', ('fsign(is_used_once)', a), b), ('bcsel', ('fneu', a, 0), ('ixor', ('iand', a, 0x8000    ), b), ('iand', a, 0x8000    ))),

    # This is 99.99% strictly correct for OpenCL. It will provide correctly
    # signed zero for ±0 inputs, and it will provide zero for NaN inputs. The
    # only slight deviation is that it can provide -0 for some NaN inputs.
    (('fsign@32', a), ('fcsel_gt',          ('fabs', a) , ('ior', ('iand', a, 0x80000000), 0x3f800000), ('iand', a, 0x80000000))),
    (('fsign@16', a), ('bcsel', ('!flt', 0, ('fabs', a)), ('ior', ('iand', a, 0x8000    ), 0x3c00    ), ('iand', a, 0x8000    ))),
]

def main():
    """Command-line entry point.

    Reads the required ``-p``/``--import-path`` option, places that directory
    at the front of ``sys.path`` and then calls ``run()``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-p', '--import-path', required=True)
    import_path = cli.parse_args().import_path

    # Index 0 so the given directory is searched first by the import that
    # run() performs.
    sys.path.insert(0, import_path)
    run()


def run():
    """Print the generated source for the ``brw_nir_lower_fsign`` pass.

    Writes the ``#include "brw_nir.h"`` line to stdout, followed by whatever
    ``AlgebraicPass.render()`` returns for the ``lower_fsign`` table.
    """
    # Imported inside the function: main() only adds the directory given by
    # --import-path to sys.path immediately before calling run().
    import nir_algebraic  # pylint: disable=import-error

    print('#include "brw_nir.h"')

    fsign_pass = nir_algebraic.AlgebraicPass("brw_nir_lower_fsign", lower_fsign)
    rendered = fsign_pass.render()
    print(rendered)


# Only run the generator when this file is executed as a script.
if __name__ == '__main__':
    main()
intel/brw: Handle fsign optimization in a NIR algebraic pass This is a lot less code, and it makes it easier to experiment with other pattern-based optimizations in the future. The results here are nearly identical to the results I got from Ken's "intel/brw: Make fsign (for 16/32-bit) in SSA form"... which are not particularly good. In this commit and in Ken's, all of the shader-db shaders hurt for spills and fills are from Deus Ex Mankind Divided. Each shader has a bunch of texture instructions with a single fsign between the blocks. With the dependency on the flag removed, the scheduler puts all of the texture instructions at the start... and there are a LOT of them. shader-db: All Intel platforms had similar results. (Meteor Lake shown) total instructions in shared programs: 19647060 -> 19650207 (0.02%) instructions in affected programs: 734718 -> 737865 (0.43%) helped: 382 / HURT: 1984 total cycles in shared programs: 823238442 -> 822785913 (-0.05%) cycles in affected programs: 426901157 -> 426448628 (-0.11%) helped: 3408 / HURT: 3671 total spills in shared programs: 3887 -> 3891 (0.10%) spills in affected programs: 256 -> 260 (1.56%) helped: 0 / HURT: 4 total fills in shared programs: 3236 -> 3306 (2.16%) fills in affected programs: 882 -> 952 (7.94%) helped: 0 / HURT: 12 LOST: 37 GAINED: 34 fossil-db: DG2 and Meteor Lake had similar results. 
(Meteor Lake shown) Totals: Instrs: 154005469 -> 154008294 (+0.00%); split: -0.00%, +0.00% Cycle count: 17551859277 -> 17554293955 (+0.01%); split: -0.02%, +0.04% Spill count: 142078 -> 142090 (+0.01%) Fill count: 266761 -> 266729 (-0.01%); split: -0.02%, +0.01% Max live registers: 32593578 -> 32593858 (+0.00%) Max dispatch width: 5535944 -> 5536816 (+0.02%); split: +0.02%, -0.01% Totals from 5867 (0.93% of 631350) affected shaders: Instrs: 5475544 -> 5478369 (+0.05%); split: -0.04%, +0.09% Cycle count: 1649032029 -> 1651466707 (+0.15%); split: -0.24%, +0.39% Spill count: 26411 -> 26423 (+0.05%) Fill count: 57364 -> 57332 (-0.06%); split: -0.10%, +0.04% Max live registers: 431561 -> 431841 (+0.06%) Max dispatch width: 49784 -> 50656 (+1.75%); split: +2.38%, -0.63% Tiger Lake Totals: Instrs: 149530671 -> 149533588 (+0.00%); split: -0.00%, +0.00% Cycle count: 15261418953 -> 15264764921 (+0.02%); split: -0.00%, +0.03% Spill count: 60317 -> 60316 (-0.00%); split: -0.02%, +0.01% Max live registers: 32249201 -> 32249464 (+0.00%) Max dispatch width: 5540608 -> 5540584 (-0.00%) Totals from 5862 (0.93% of 630309) affected shaders: Instrs: 4740800 -> 4743717 (+0.06%); split: -0.04%, +0.10% Cycle count: 566531248 -> 569877216 (+0.59%); split: -0.13%, +0.72% Spill count: 11709 -> 11708 (-0.01%); split: -0.09%, +0.08% Max live registers: 424560 -> 424823 (+0.06%) Max dispatch width: 50304 -> 50280 (-0.05%) Ice Lake Totals: Instrs: 150499705 -> 150502608 (+0.00%); split: -0.00%, +0.00% Cycle count: 15105629116 -> 15105425880 (-0.00%); split: -0.00%, +0.00% Spill count: 60087 -> 60090 (+0.00%) Fill count: 100542 -> 100541 (-0.00%); split: -0.00%, +0.00% Max live registers: 32605215 -> 32605495 (+0.00%) Max dispatch width: 5617752 -> 5617792 (+0.00%); split: +0.00%, -0.00% Totals from 5882 (0.93% of 634934) affected shaders: Instrs: 4737206 -> 4740109 (+0.06%); split: -0.04%, +0.10% Cycle count: 598882104 -> 598678868 (-0.03%); split: -0.08%, +0.05% Spill count: 10278 -> 10281 
(+0.03%) Fill count: 22504 -> 22503 (-0.00%); split: -0.01%, +0.01% Max live registers: 424184 -> 424464 (+0.07%) Max dispatch width: 50216 -> 50256 (+0.08%); split: +0.25%, -0.18% Skylake Totals: Instrs: 139092612 -> 139095257 (+0.00%); split: -0.00%, +0.00% Cycle count: 14533550285 -> 14533544716 (-0.00%); split: -0.00%, +0.00% Spill count: 58176 -> 58172 (-0.01%) Fill count: 95877 -> 95796 (-0.08%) Max live registers: 31924594 -> 31924874 (+0.00%) Max dispatch width: 5484568 -> 5484552 (-0.00%); split: +0.00%, -0.00% Totals from 5789 (0.93% of 625512) affected shaders: Instrs: 4481987 -> 4484632 (+0.06%); split: -0.04%, +0.10% Cycle count: 578310124 -> 578304555 (-0.00%); split: -0.05%, +0.05% Spill count: 9248 -> 9244 (-0.04%) Fill count: 19677 -> 19596 (-0.41%) Max live registers: 415340 -> 415620 (+0.07%) Max dispatch width: 49720 -> 49704 (-0.03%); split: +0.10%, -0.13% Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29095> 2024-05-01 21:02:13 -07:00			`# Copyright © 2024 Intel Corporation`
			`# SPDX-License-Identifier: MIT`

			`import argparse`
			`import sys`
			`from math import pi`

			`a = 'a'`
			`b = 'b'`

			`lower_fsign = [`
			`# This matches the behavior of the old optimization in brw_fs_nir.cpp, but`
			`# it has some problems.`
			`#`
			`# The fmul version passes Vulkan float_controls2 CTS a little bit by`
			`# luck. The use of fne means that the false path (i.e., fsign(X) == 0) is`
			`# only taken when X is zero. For OpenCL, this path should also be taken`
			`# when when X is NaN. This can be handled by using 'fabs(X) > 0', but this`
			`# fails float_controls2 CTS when the other multiplication operand is NaN.`
			`#`
			`# This optimization is additionally problematic when fsign(X) is zero and`
			`# the other multiplication operand is Inf. This will result in 0, but it`
			`# should result in NaN. This does not seem to be tested by the CTS.`
			`#`
			`# NOTE: fcsel opcodes are currently limited to float32 in NIR.`
			`(('fmul@32', ('fsign(is_used_once)', a), b), ('fcsel', a , ('ixor', ('iand', a, 0x80000000), b), ('iand', a, 0x80000000))),`
			`(('fmul@16', ('fsign(is_used_once)', a), b), ('bcsel', ('fneu', a, 0), ('ixor', ('iand', a, 0x8000 ), b), ('iand', a, 0x8000 ))),`

			`# This is 99.99% strictly correct for OpenCL. It will provide correctly`
			`# signed zero for ±0 inputs, and it will provide zero for NaN inputs. The`
			`# only slight deviation is that it can provide -0 for some NaN inputs.`
			`(('fsign@32', a), ('fcsel_gt', ('fabs', a) , ('ior', ('iand', a, 0x80000000), 0x3f800000), ('iand', a, 0x80000000))),`
			`(('fsign@16', a), ('bcsel', ('!flt', 0, ('fabs', a)), ('ior', ('iand', a, 0x8000 ), 0x3c00 ), ('iand', a, 0x8000 ))),`
			`]`

			`def main():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('-p', '--import-path', required=True)`
			`args = parser.parse_args()`
			`sys.path.insert(0, args.import_path)`
			`run()`


			`def run():`
			`import nir_algebraic # pylint: disable=import-error`

			`print('#include "brw_nir.h"')`

			`print(nir_algebraic.AlgebraicPass("brw_nir_lower_fsign", lower_fsign).render())`


			`if __name__ == '__main__':`
			`main()`