From 708093d830684df3f1d60df3c3f96a381f68c951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Fri, 24 Apr 2026 08:26:53 +0200 Subject: [PATCH] nir/opt_algebraic: use imul24_relaxed for lowered dot4x8_add The same substitution (imul24_relaxed for the signed and mixed-sign products, umul24_relaxed for the unsigned ones) is also applied to the lowered 2x16 dot products; every multiplicand is an 8- or 16-bit extract and therefore always fits in 24 bits. Totals from 28 (0.04% of 72819) affected shaders: (Navi10) MaxWaves: 181 -> 186 (+2.76%) Instrs: 406735 -> 338360 (-16.81%) CodeSize: 2913588 -> 2469712 (-15.23%) VGPRs: 5520 -> 5468 (-0.94%) SpillVGPRs: 32 -> 0 (-inf%) LDS: 64512 -> 62464 (-3.17%) Scratch: 10240 -> 0 (-inf%) Latency: 11028252 -> 4357120 (-60.49%) InvThroughput: 11004126 -> 4079018 (-62.93%) VClause: 1686 -> 2055 (+21.89%); split: -0.89%, +22.78% SClause: 890 -> 852 (-4.27%) Copies: 4516 -> 2644 (-41.45%); split: -41.59%, +0.13% PreSGPRs: 982 -> 974 (-0.81%) PreVGPRs: 5356 -> 4284 (-20.01%) VALU: 370529 -> 330201 (-10.88%) SALU: 28850 -> 1170 (-95.94%) VMEM: 2616 -> 2560 (-2.14%) Part-of: --- src/compiler/nir/nir_opt_algebraic.py | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index ebcaae2c77e..5d141cf6fb9 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -410,22 +410,22 @@ for sz in (16, 32, 64): # Shorthand for the expansion of just the dot product part of the [iu]dp4a # instructions.
-sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)), - ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))), - ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)), - ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3)))) -udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)), - ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))), - ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)), - ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3)))) -sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)), - ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))), - ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)), - ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3)))) -sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)), - ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1))) -udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)), - ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1))) +sdot_4x8_a_b = ('iadd', ('iadd', ('imul24_relaxed', ('extract_i8', a, 0), ('extract_i8', b, 0)), + ('imul24_relaxed', ('extract_i8', a, 1), ('extract_i8', b, 1))), + ('iadd', ('imul24_relaxed', ('extract_i8', a, 2), ('extract_i8', b, 2)), + ('imul24_relaxed', ('extract_i8', a, 3), ('extract_i8', b, 3)))) +udot_4x8_a_b = ('iadd', ('iadd', ('umul24_relaxed', ('extract_u8', a, 0), ('extract_u8', b, 0)), + ('umul24_relaxed', ('extract_u8', a, 1), ('extract_u8', b, 1))), + ('iadd', ('umul24_relaxed', ('extract_u8', a, 2), ('extract_u8', b, 2)), + ('umul24_relaxed', ('extract_u8', a, 3), ('extract_u8', b, 3)))) +sudot_4x8_a_b = ('iadd', ('iadd', ('imul24_relaxed', ('extract_i8', a, 0), ('extract_u8', b, 0)), + ('imul24_relaxed', ('extract_i8', a, 1), ('extract_u8', b, 1))), + ('iadd', ('imul24_relaxed', ('extract_i8', a, 2), ('extract_u8', b, 2)), + ('imul24_relaxed', ('extract_i8', a, 3), ('extract_u8', b, 
3)))) +sdot_2x16_a_b = ('iadd', ('imul24_relaxed', ('extract_i16', a, 0), ('extract_i16', b, 0)), + ('imul24_relaxed', ('extract_i16', a, 1), ('extract_i16', b, 1))) +udot_2x16_a_b = ('iadd', ('umul24_relaxed', ('extract_u16', a, 0), ('extract_u16', b, 0)), + ('umul24_relaxed', ('extract_u16', a, 1), ('extract_u16', b, 1))) optimizations.extend([ (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),