From f19b9eddb6f9907adb1d1c41a126468dbd355f68 Mon Sep 17 00:00:00 2001
From: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Date: Mon, 30 Mar 2026 10:47:54 +0200
Subject: [PATCH] pan/compiler: Replace bi_lower_ldexp16 with algebraic pass

Signed-off-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40717>
---
 .../compiler/bifrost/bifrost_compile.c        | 46 -------------------
 .../compiler/bifrost/bifrost_nir_algebraic.py | 23 ++++++++++
 2 files changed, 23 insertions(+), 46 deletions(-)

diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index d177237a169..c393fd16121 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -5972,49 +5972,6 @@ bifrost_nir_lower_load_output(nir_shader *nir)
       nir_metadata_control_flow, NULL);
 }
 
-/* Bifrost LDEXP.v2f16 takes i16 exponent, while nir_op_ldexp takes i32. Lower
- * to nir_op_ldexp16_pan. */
-static bool
-bi_lower_ldexp16(nir_builder *b, nir_alu_instr *alu, UNUSED void *data)
-{
-   if (alu->op != nir_op_ldexp || alu->def.bit_size != 16)
-      return false;
-
-   b->cursor = nir_before_instr(&alu->instr);
-
-   nir_def *x = nir_ssa_for_alu_src(b, alu, 0);
-   nir_def *exp32 = nir_ssa_for_alu_src(b, alu, 1);
-
-   /* From the GLSL 4.60 spec (section 8.3):
-    *
-    *    "If exp is greater than +128 (single-precision) or +1024
-    *     (double-precision), the value returned is undefined. If exp is less
-    *     than -126 (single- precision) or -1022 (double-precision), the value
-    *     returned may be flushed to zero."
-    *
-    * So we can't just truncate the exponent. Overflow is undefined behavior,
-    * but we need to return signed zero on underflow. If exp32 < INT16_MIN, we
-    * can use any 16-bit exponent that's sufficiently small to send all f16
-    * values to zero.
-    *
-    * If we test exp32 < INT16_MIN directly, the comparison could not be
-    * vectorized, so instead we test the upper half.
-    */
-   nir_def *exp16_high = nir_unpack_32_2x16_split_y(b, exp32);
-   nir_def *underflow = nir_ilt16(b, exp16_high, nir_imm_intN_t(b, -1, 16));
-
-   /* TODO: some possible values for this constant can be encoded as small
-    * immediates on valhall. None of the usable immediates have replicated i16
-    * lanes, but for example 0xFAFCFDFE would be {-1284,-514}, both of which
-    * are small enough. */
-   nir_def *min_exp = nir_imm_intN_t(b, -127, 16);
-   nir_def *exp16 = nir_b16csel(b, underflow, min_exp, nir_i2i16(b, exp32));
-
-   nir_def_replace(&alu->def, nir_ldexp16_pan(b, x, exp16));
-
-   return true;
-}
-
 void
 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
 {
@@ -6171,9 +6128,6 @@ bifrost_postprocess_nir(nir_shader *nir, unsigned gpu_id)
    NIR_PASS(_, nir, nir_lower_idiv,
             &(nir_lower_idiv_options){.allow_fp16 = true});
 
-   NIR_PASS(_, nir, nir_shader_alu_pass, bi_lower_ldexp16,
-            nir_metadata_control_flow, NULL);
-
    NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id);
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
    NIR_PASS(_, nir, nir_lower_phis_to_scalar, bi_vectorize_filter, &gpu_id);
diff --git a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
index 3a5ded6b8ce..7988355d1fd 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
+++ b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
@@ -122,6 +122,29 @@ for bsz in [8, 16, 32]:
             ((f'b2f{fsz}', f'a@{bsz}'), (f'b{fsz}csel', a_fsz, 1.0, 0.0)),
         ]
 
+# Bifrost LDEXP.v2f16 takes i16 exponent, while nir_op_ldexp takes i32. Lower
+# to nir_op_ldexp16_pan.
+#
+# From the GLSL 4.60 spec (section 8.3):
+#     "If exp is greater than +128 (single-precision) or +1024
+#     (double-precision), the value returned is undefined. If exp is less
+#     than -126 (single-precision) or -1022 (double-precision), the value
+#     returned may be flushed to zero."
+#
+# So we can't just truncate the exponent. Overflow is undefined behavior, but
+# we need to return signed zero on underflow. If exp32 < INT16_MIN, we can use
+# any 16-bit exponent that's sufficiently small to send all f16 values to zero.
+#
+# If we tested exp32 < INT16_MIN directly, the comparison could not be
+# vectorized, so instead we test the upper half (fires for exp32 < -65536).
+# TODO: some possible values for the -127 constant can be encoded as small
+#       immediates on valhall. None of the usable immediates have replicated
+#       i16 lanes, but for example 0xFAFCFDFE would be {-1284,-514}, both of
+#       which are small enough.
+algebraic_late += [
+    (('ldexp', 'a@16', b), ('ldexp16_pan', a, ('b16csel', ('ilt16', ('unpack_32_2x16_split_y', b), -1), -127, ('i2i16', b))))
+]
+
 
 def main():
     parser = argparse.ArgumentParser()