From 1c7e35d4e00632188787fbfffb5a9882db9cc48c Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Sat, 6 Jul 2024 11:24:43 -0700 Subject: [PATCH] nir/algebraic: Optimize some bit operation nonsense observed in some shaders In updates (not posted at the time of this writing) to !29884, a change caused many spill and fill regressions in a shader for OpenGL Tomb Raider. While looking at that shader, I noticed some odd patterns. I initially added these patterns to counteract the regressions caused by the other change, but I had no luck. On Ice Lake... this cuts 99 instructions from the shader. shader-db: All Intel platforms had similar results. (Meteor Lake shown) total instructions in shared programs: 19732341 -> 19732295 (<.01%) instructions in affected programs: 1744 -> 1698 (-2.64%) helped: 1 / HURT: 0 total cycles in shared programs: 916273716 -> 916273068 (<.01%) cycles in affected programs: 14266 -> 13618 (-4.54%) helped: 1 / HURT: 0 fossil-db: All Intel platforms had similar results. (Meteor Lake shown) Totals: Instrs: 151519575 -> 151519393 (-0.00%) Cycle count: 17208402120 -> 17208246858 (-0.00%); split: -0.00%, +0.00% Totals from 159 (0.03% of 630198) affected shaders: Instrs: 51970 -> 51788 (-0.35%) Cycle count: 11474176 -> 11318914 (-1.35%); split: -1.36%, +0.01% Reviewed-by: Georg Lehmann Part-of: --- src/compiler/nir/nir_opt_algebraic.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index e7287e20db7..9daf7b62ba5 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -1876,6 +1876,22 @@ optimizations.extend([ (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), (('ior', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), + (('bfi', 0xffff0000, ('pack_half_2x16_split', a, b), ('pack_half_2x16_split', 
c, d)), + ('pack_half_2x16_split', c, a)), + + # Part of the BFI operation is src2&~src0. This expands to (b & 3) & ~0xc + # which is (b & 3) & 3. + (('bfi', 0x0000000c, a, ('iand', b, 3)), ('bfi', 0x0000000c, a, b)), + + # The important part here is that ~0xf & 0xfffffffc = ~0xf. + (('iand', ('bfi', 0x0000000f, '#a', b), 0xfffffffc), + ('bfi', 0x0000000f, ('iand', a, 0xfffffffc), b)), + (('iand', ('bfi', 0x00000007, '#a', b), 0xfffffffc), + ('bfi', 0x00000007, ('iand', a, 0xfffffffc), b)), + + # 0x0f << 3 == 0x78, so that's already the maximum possible value. + (('umin', ('ishl', ('iand', a, 0xf), 3), 0x78), ('ishl', ('iand', a, 0xf), 3)), + (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),