nir/opt_algebraic: use imul24_relaxed for lowered dot4x8_add

Totals from 28 (0.04% of 72819) affected shaders: (Navi10)

MaxWaves: 181 -> 186 (+2.76%)
Instrs: 406735 -> 338360 (-16.81%)
CodeSize: 2913588 -> 2469712 (-15.23%)
VGPRs: 5520 -> 5468 (-0.94%)
SpillVGPRs: 32 -> 0 (-inf%)
LDS: 64512 -> 62464 (-3.17%)
Scratch: 10240 -> 0 (-inf%)
Latency: 11028252 -> 4357120 (-60.49%)
InvThroughput: 11004126 -> 4079018 (-62.93%)
VClause: 1686 -> 2055 (+21.89%); split: -0.89%, +22.78%
SClause: 890 -> 852 (-4.27%)
Copies: 4516 -> 2644 (-41.45%); split: -41.59%, +0.13%
PreSGPRs: 982 -> 974 (-0.81%)
PreVGPRs: 5356 -> 4284 (-20.01%)
VALU: 370529 -> 330201 (-10.88%)
SALU: 28850 -> 1170 (-95.94%)
VMEM: 2616 -> 2560 (-2.14%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41178>
This commit is contained in:
Daniel Schürmann 2026-04-24 08:26:53 +02:00 committed by Marge Bot
parent fe067b17d9
commit 708093d830

View file

@@ -410,22 +410,22 @@ for sz in (16, 32, 64):
# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
#
# Each shorthand is a nested-tuple expression over the pattern variables `a`
# and `b`: per-lane extracts are multiplied pairwise and the products are
# summed with 'iadd'.
#
# NOTE(review): every name below is assigned twice.  The first group builds the
# products with 'imul'; the second group rebinds the same names using
# 'imul24_relaxed' / 'umul24_relaxed'.  This looks like a diff hunk whose +/-
# markers were stripped, so the first group is presumably the removed side and
# the second the added side -- confirm against the actual patch.

# 'imul' variants (presumably the pre-change lines).
# Four 8-bit lanes (indices 0..3), both operands extracted with extract_i8.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
# Four 8-bit lanes, both operands extracted with extract_u8.
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
# Mixed form: `a` lanes use extract_i8 while `b` lanes use extract_u8.
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
# Two 16-bit lanes (indices 0..1), both operands extracted with extract_i16.
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
# Two 16-bit lanes, both operands extracted with extract_u16.
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))
# '*mul24_relaxed' variants (presumably the post-change lines).  The shapes are
# identical to the group above; only the multiply opcode differs.  Forms whose
# `a` operand uses a signed extract take 'imul24_relaxed'; forms where both
# operands use unsigned extracts take 'umul24_relaxed'.
# NOTE(review): the extracted operands are 8 or 16 bits wide, so they presumably
# satisfy whatever range the 24-bit multiply opcodes require -- confirm against
# the opcode definitions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul24_relaxed', ('extract_i8', a, 0), ('extract_i8', b, 0)),
('imul24_relaxed', ('extract_i8', a, 1), ('extract_i8', b, 1))),
('iadd', ('imul24_relaxed', ('extract_i8', a, 2), ('extract_i8', b, 2)),
('imul24_relaxed', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('umul24_relaxed', ('extract_u8', a, 0), ('extract_u8', b, 0)),
('umul24_relaxed', ('extract_u8', a, 1), ('extract_u8', b, 1))),
('iadd', ('umul24_relaxed', ('extract_u8', a, 2), ('extract_u8', b, 2)),
('umul24_relaxed', ('extract_u8', a, 3), ('extract_u8', b, 3))))
# Mixed signed/unsigned form uses the signed multiply opcode.
sudot_4x8_a_b = ('iadd', ('iadd', ('imul24_relaxed', ('extract_i8', a, 0), ('extract_u8', b, 0)),
('imul24_relaxed', ('extract_i8', a, 1), ('extract_u8', b, 1))),
('iadd', ('imul24_relaxed', ('extract_i8', a, 2), ('extract_u8', b, 2)),
('imul24_relaxed', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul24_relaxed', ('extract_i16', a, 0), ('extract_i16', b, 0)),
('imul24_relaxed', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('umul24_relaxed', ('extract_u16', a, 0), ('extract_u16', b, 0)),
('umul24_relaxed', ('extract_u16', a, 1), ('extract_u16', b, 1)))
optimizations.extend([
(('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),