From c3db34a525083504cc8bdd6fd9ab5736a458a445 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 6 May 2026 15:16:41 +0100 Subject: [PATCH] nir/algebraic: optimize ishl(iadd(ishl, ishl)) This reduces arithmetic for cooperative matrix loads: v_mbcnt_lo_u32_b32 v0, -1, 0 v_and_b32_e32 v1, 15, v0 v_lshrrev_b32_e32 v0, 4, v0 v_lshlrev_b32_e32 v1, 4, v1 v_lshl_add_u32 v0, v0, 3, v1 v_lshlrev_b32_e32 v0, 1, v0 -> v_mbcnt_lo_u32_b32 v0, -1, 0 v_and_b32_e32 v1, -16, v0 v_and_b32_e32 v0, 15, v0 v_lshl_add_u32 v0, v0, 5, v1 fossil-db (gfx1201): Totals from 38 (0.02% of 208640) affected shaders: Instrs: 42234 -> 42181 (-0.13%) CodeSize: 232656 -> 232384 (-0.12%) Latency: 128807 -> 128759 (-0.04%) InvThroughput: 20860 -> 20850 (-0.05%) VALU: 23035 -> 23013 (-0.10%) SALU: 4790 -> 4784 (-0.13%) fossil-db (gfx1201, dEQP-VK.compute.pipeline.cooperative_matrix.*): Totals from 44 (2.71% of 1623) affected shaders: Instrs: 46834 -> 46802 (-0.07%) CodeSize: 287536 -> 287272 (-0.09%) Latency: 100960 -> 100918 (-0.04%); split: -0.10%, +0.06% InvThroughput: 21808 -> 21796 (-0.06%) VALU: 19336 -> 19328 (-0.04%) SALU: 10790 -> 10782 (-0.07%) Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/compiler/nir/nir_opt_algebraic.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 50f50029067..def08e7ada3 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -605,19 +605,24 @@ for s in [8, 16, 32, 64]: mask = s - 1 ishl = "ishl@{}".format(s) + ishl_once = "ishl@{}(is_used_once)".format(s) ishr = "ishr@{}".format(s) ushr = "ushr@{}".format(s) - in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s) + in_bounds = lambda x, y: ('ult', ('iadd', ('iand', x, mask), ('iand', y, mask)), s) optimizations.extend([ - ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)), - ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)), + ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds(b, c), (ishl, a, ('iadd', b, c)), 0)), + ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds(b, c), (ushr, a, ('iadd', b, c)), 0)), # To get get -1 for large shifts of negative values, ishr must instead # clamp the shift count to the maximum value. ((ishr, (ishr, a, '#b'), '#c'), (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))), + + ((ishl, ('iadd(is_used_once)', (ishl_once, a, '#b'), (ishl_once, c, '#d')), '#e'), + ('iadd', ('bcsel', in_bounds(b, e), ('ishl', a, ('iand', ('iadd', b, e), mask)), 0), + ('bcsel', in_bounds(d, e), ('ishl', c, ('iand', ('iadd', d, e), mask)), 0))), ]) # Optimize a pattern of address calculation created by DXVK where the offset is