From 1c7e35d4e00632188787fbfffb5a9882db9cc48c Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Sat, 6 Jul 2024 11:24:43 -0700
Subject: [PATCH] nir/algebraic: Optimize some bit operation nonsense observed
 in some shaders

In updates (not posted at the time of this writing) to !29884, a change
caused many spill and fill regressions in a shader for OpenGL Tomb
Raider. While looking at that shader, I noticed some odd patterns. I
initially added these patterns to counteract the regressions caused by
the other change, but I had no luck. On Ice Lake... this cuts 99
instructions from the shader.

shader-db:

All Intel platforms had similar results. (Meteor Lake shown)
total instructions in shared programs: 19732341 -> 19732295 (<.01%)
instructions in affected programs: 1744 -> 1698 (-2.64%)
helped: 1 / HURT: 0

total cycles in shared programs: 916273716 -> 916273068 (<.01%)
cycles in affected programs: 14266 -> 13618 (-4.54%)
helped: 1 / HURT: 0

fossil-db:

All Intel platforms had similar results. (Meteor Lake shown)
Totals:
Instrs: 151519575 -> 151519393 (-0.00%)
Cycle count: 17208402120 -> 17208246858 (-0.00%); split: -0.00%, +0.00%

Totals from 159 (0.03% of 630198) affected shaders:
Instrs: 51970 -> 51788 (-0.35%)
Cycle count: 11474176 -> 11318914 (-1.35%); split: -1.36%, +0.01%

Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30158>
---
 src/compiler/nir/nir_opt_algebraic.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index e7287e20db7..9daf7b62ba5 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -1876,6 +1876,22 @@ optimizations.extend([
    (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
    (('ior',  ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)),
 
+   (('bfi', 0xffff0000, ('pack_half_2x16_split', a, b), ('pack_half_2x16_split', c, d)),
+    ('pack_half_2x16_split', c, a)),
+
+   # Part of the BFI operation is src2&~src0. This expands to (b & 3) & ~0xc
+   # which is (b & 3) & 0xfffffff3, i.e. just b & 3.
+   (('bfi', 0x0000000c, a, ('iand', b, 3)), ('bfi', 0x0000000c, a, b)),
+
+   # The important part here is that ~0xf & 0xfffffffc = ~0xf.
+   (('iand', ('bfi', 0x0000000f, '#a', b), 0xfffffffc),
+    ('bfi', 0x0000000f, ('iand', a, 0xfffffffc), b)),
+   (('iand', ('bfi', 0x00000007, '#a', b), 0xfffffffc),
+    ('bfi', 0x00000007, ('iand', a, 0xfffffffc), b)),
+
+   # 0x0f << 3 == 0x78, so that's already the maximum possible value.
+   (('umin', ('ishl', ('iand', a, 0xf), 3), 0x78), ('ishl', ('iand', a, 0xf), 3)),
+
    (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
    (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
    (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),