From c3db34a525083504cc8bdd6fd9ab5736a458a445 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Wed, 6 May 2026 15:16:41 +0100
Subject: [PATCH] nir/algebraic: optimize ishl(iadd(ishl, ishl))

This reduces arithmetic for cooperative matrix loads:
v_mbcnt_lo_u32_b32 v0, -1, 0
v_and_b32_e32 v1, 15, v0
v_lshrrev_b32_e32 v0, 4, v0
v_lshlrev_b32_e32 v1, 4, v1
v_lshl_add_u32 v0, v0, 3, v1
v_lshlrev_b32_e32 v0, 1, v0
->
v_mbcnt_lo_u32_b32 v0, -1, 0
v_and_b32_e32 v1, -16, v0
v_and_b32_e32 v0, 15, v0
v_lshl_add_u32 v0, v0, 5, v1

fossil-db (gfx1201):
Totals from 38 (0.02% of 208640) affected shaders:
Instrs: 42234 -> 42181 (-0.13%)
CodeSize: 232656 -> 232384 (-0.12%)
Latency: 128807 -> 128759 (-0.04%)
InvThroughput: 20860 -> 20850 (-0.05%)
VALU: 23035 -> 23013 (-0.10%)
SALU: 4790 -> 4784 (-0.13%)

fossil-db (gfx1201, dEQP-VK.compute.pipeline.cooperative_matrix.*):
Totals from 44 (2.71% of 1623) affected shaders:
Instrs: 46834 -> 46802 (-0.07%)
CodeSize: 287536 -> 287272 (-0.09%)
Latency: 100960 -> 100918 (-0.04%); split: -0.10%, +0.06%
InvThroughput: 21808 -> 21796 (-0.06%)
VALU: 19336 -> 19328 (-0.04%)
SALU: 10790 -> 10782 (-0.07%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41653>
---
 src/compiler/nir/nir_opt_algebraic.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 50f50029067..def08e7ada3 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -605,19 +605,24 @@ for s in [8, 16, 32, 64]:
    mask = s - 1
 
    ishl = "ishl@{}".format(s)
+   ishl_once = "ishl@{}(is_used_once)".format(s)
    ishr = "ishr@{}".format(s)
    ushr = "ushr@{}".format(s)
 
-   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
+   in_bounds = lambda x, y: ('ult', ('iadd', ('iand', x, mask), ('iand', y, mask)), s)
 
    optimizations.extend([
-       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
-       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
+       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds(b, c), (ishl, a, ('iadd', b, c)), 0)),
+       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds(b, c), (ushr, a, ('iadd', b, c)), 0)),
 
        # To get get -1 for large shifts of negative values, ishr must instead
        # clamp the shift count to the maximum value.
        ((ishr, (ishr, a, '#b'), '#c'),
         (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
+
+       ((ishl, ('iadd(is_used_once)', (ishl_once, a, '#b'), (ishl_once, c, '#d')), '#e'),
+        ('iadd', ('bcsel', in_bounds(b, e), ('ishl', a, ('iand', ('iadd', b, e), mask)), 0),
+                 ('bcsel', in_bounds(d, e), ('ishl', c, ('iand', ('iadd', d, e), mask)), 0))),
    ])
 
 # Optimize a pattern of address calculation created by DXVK where the offset is