nir/algebraic: optimize ishl(iadd(ishl, ishl))

This reduces arithmetic for cooperative matrix loads:

   v_mbcnt_lo_u32_b32 v0, -1, 0
   v_and_b32_e32 v1, 15, v0
   v_lshrrev_b32_e32 v0, 4, v0
   v_lshlrev_b32_e32 v1, 4, v1
   v_lshl_add_u32 v0, v0, 3, v1
   v_lshlrev_b32_e32 v0, 1, v0
->
   v_mbcnt_lo_u32_b32 v0, -1, 0
   v_and_b32_e32 v1, -16, v0
   v_and_b32_e32 v0, 15, v0
   v_lshl_add_u32 v0, v0, 5, v1

fossil-db (gfx1201):
Totals from 38 (0.02% of 208640) affected shaders:
Instrs: 42234 -> 42181 (-0.13%)
CodeSize: 232656 -> 232384 (-0.12%)
Latency: 128807 -> 128759 (-0.04%)
InvThroughput: 20860 -> 20850 (-0.05%)
VALU: 23035 -> 23013 (-0.10%)
SALU: 4790 -> 4784 (-0.13%)

fossil-db (gfx1201, dEQP-VK.compute.pipeline.cooperative_matrix.*):
Totals from 44 (2.71% of 1623) affected shaders:
Instrs: 46834 -> 46802 (-0.07%)
CodeSize: 287536 -> 287272 (-0.09%)
Latency: 100960 -> 100918 (-0.04%); split: -0.10%, +0.06%
InvThroughput: 21808 -> 21796 (-0.06%)
VALU: 19336 -> 19328 (-0.04%)
SALU: 10790 -> 10782 (-0.07%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41653>
2026-06-04 00:08:16 +02:00 · 2026-05-06 15:16:41 +01:00 · 2026-05-06 15:16:41 +01:00 · c3db34a525
commit c3db34a525
parent b1c40839f2
1 changed files with 8 additions and 3 deletions
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -605,19 +605,24 @@ for s in [8, 16, 32, 64]:
   mask = s - 1

   ishl = "ishl@{}".format(s)
+   ishl_once = "ishl@{}(is_used_once)".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

-   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
+   in_bounds = lambda x, y: ('ult', ('iadd', ('iand', x, mask), ('iand', y, mask)), s)

   optimizations.extend([
-       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
-       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
+       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds(b, c), (ishl, a, ('iadd', b, c)), 0)),
+       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds(b, c), (ushr, a, ('iadd', b, c)), 0)),

       # To get -1 for large shifts of negative values, ishr must instead
       # clamp the shift count to the maximum value.
       ((ishr, (ishr, a, '#b'), '#c'),
        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
+
+       ((ishl, ('iadd(is_used_once)', (ishl_once, a, '#b'), (ishl_once, c, '#d')), '#e'),
+        ('iadd', ('bcsel', in_bounds(b, e), ('ishl', a, ('iand', ('iadd', b, e), mask)), 0),
+                 ('bcsel', in_bounds(d, e), ('ishl', c, ('iand', ('iadd', d, e), mask)), 0))),
   ])

 # Optimize a pattern of address calculation created by DXVK where the offset is