nak: add algebraic patterns to improve MUFU.F16

Doesn't really help many shaders, but I've seen a couple that turn from MUFU into F2F(MUFU.F16(F2F)). Though this might just as well be a limitation of related code, e.g. returning F32 from TEX instead of using TEX.F16. Totals: CodeSize: 8662337424 -> 8662336960 (-0.00%) Static cycle count: 4718044491 -> 4718044554 (+0.00%); split: -0.00%, +0.00% Totals from 7 (0.00% of 1163204) affected shaders: CodeSize: 236480 -> 236016 (-0.20%) Static cycle count: 2108061 -> 2108124 (+0.00%); split: -0.01%, +0.01% Reviewed-by: Mel Henning <mhenning@darkrefraction.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40392>
2026-05-01 05:58:05 +02:00 · 2026-03-21 15:17:22 +01:00 · 2026-03-21 15:17:22 +01:00 · 72e9f9a760
commit 72e9f9a760
parent 9cc2cd843b
1 changed files with 35 additions and 0 deletions
--- a/src/nouveau/compiler/nak_nir_algebraic.py
+++ b/src/nouveau/compiler/nak_nir_algebraic.py
@ -31,6 +31,7 @@ s = 's'

 # common conditions to improve readability
 volta = 'nak->sm >= 70 && nak->sm < 73'
+fp16_round_is_rtz = 'nir_is_rounding_mode_rtz(info->float_controls_execution_mode, 16)'

 algebraic_lowering = [
    # Volta doesn't have `IMNMX`
@ -56,6 +57,40 @@ for f2f16 in ['f2f16', 'f2f16_rtz', 'f2f16_rtne']:
        (('vec2', (f2f16 + '(is_used_once)', 'a@32'), (f2f16 + '(is_used_once)', 'b@32')), (f2f16, ('vec2', a, b)), 'nak->sm >= 86')
    ]

+# If we find mufu surrounded by bit_size conversions, just do the op in the
+# original bit_size.
+# MUFU.F16 appears to operate internally with the same precision as MUFU.F32,
+# with the result rounded towards zero to F16. EXP2 and RCP seem to be off by
+# one around Inf, so for those it's only safe if we can ignore Inf.
+#
+# This was verified with `hw_tests::test_op_mufu_f16_down`.
+
+# For these ops, mufu.f16 is identical to mufu.f32 with rtz rounding, except for results around infinity
+for op in ['fexp2', 'frcp']:
+    algebraic_lowering += [
+        (('f2f16_rtz(ninf)', (op + '(is_used_once)', ('f2f32', 'a@16'))), (op, a), 'nak->sm >= 73'),
+        (('f2f16(ninf)', (op + '(is_used_once)', ('f2f32', 'a@16'))),
+            (op, a),
+            'nak->sm >= 73 && ' + fp16_round_is_rtz),
+    ]
+
+# For these ops, mufu.f16 is identical to mufu.f32 with rtz rounding
+for op in ['fcos_normalized_2_pi', 'flog2', 'frsq', 'fsin_normalized_2_pi', 'fsqrt']:
+    algebraic_lowering += [
+        (('f2f16_rtz', (op + '(is_used_once)', ('f2f32', 'a@16'))), (op, a), 'nak->sm >= 73'),
+        (('f2f16', (op + '(is_used_once)', ('f2f32', 'a@16'))),
+            (op, a),
+            'nak->sm >= 73 && ' + fp16_round_is_rtz),
+    ]
+
+# If contract is on we can always remove the conversions
+for op in ['fcos_normalized_2_pi', 'fexp2', 'flog2', 'frcp', 'frsq', 'fsin_normalized_2_pi', 'fsqrt']:
+    for f2f16 in ['f2f16_rtz', 'f2f16_rtne', 'f2f16']:
+        algebraic_lowering += [
+            ((f2f16 + '(contract)', (op + '(is_used_once)', ('f2f32', 'a@16'))),
+                (op, a), 'nak->sm >= 73'),
+            (('f2f32(contract)', (op + '(is_used_once)', (f2f16, 'a@32'))), (op, a)),
+    ]

 def main():
    parser = argparse.ArgumentParser()