From 352a8d6beb61767406026b2e2db6628210115e80 Mon Sep 17 00:00:00 2001
From: Eric Guo <eric.guo@nxp.com>
Date: Tue, 28 Apr 2026 17:02:22 +0800
Subject: [PATCH] pan/compiler: Clamp fp16 ldexp exponent range

Fix the OpenCL-CTS failure in `math_brute_force/test_bruteforce -w ldexp`.

Valhall LDEXP.v2f16 takes a 16-bit exponent, while NIR ldexp uses a
32-bit exponent. Truncating a large 32-bit exponent can turn overflow
into underflow, and an exponent that still fits in 16 bits can be large
enough that the hardware result does not match the signed infinity or
signed zero that OpenCL expects.

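Clamp the 32-bit exponent to [-127, 127] before converting it to 16 bits
for ldexp16_pan. Finite nonzero fp16 magnitudes span 2^-24 to just under
2^16, so this range is sufficient to overflow or underflow every fp16
value.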

Signed-off-by: Eric Guo <eric.guo@nxp.com>
Reviewed-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41234>
---
 .../compiler/bifrost/bifrost_nir_algebraic.py  | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
index 7988355d1fd..257bf7d7bd6 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
+++ b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
@@ -131,18 +131,14 @@ for bsz in [8, 16, 32]:
 #     than -126 (single- precision) or -1022 (double-precision), the value
 #     returned may be flushed to zero."
 #
-# So we can't just truncate the exponent. Overflow is undefined behavior, but
-# we need to return signed zero on underflow. If exp32 < INT16_MIN, we can use
-# any 16-bit exponent that's sufficiently small to send all f16 values to zero.
-#
-# If we test exp32 < INT16_MIN directly, the comparison could not be
-# vectorized, so instead we test the upper half.
-# TODO: some possible values for -127 can be encoded as small
-#       immediates on valhall. None of the usable immediates have replicated
-#       i16 lanes, but for example 0xFAFCFDFE would be {-1284,-514}, both of
-#       which are small enough.
+# So we can't just truncate the exponent. Overflow is undefined behavior for
+# GLSL, but OpenCL expects us to return signed infinity, and we need to return
+# signed zero on underflow. Clamp to a range that's sufficient to overflow or
+# underflow all f16 values, avoiding implementation-defined behavior for huge
+# exponents in LDEXP.v2f16.
 algebraic_late += [
-    (('ldexp', 'a@16', b), ('ldexp16_pan', a, ('b16csel', ('ilt16', ('unpack_32_2x16_split_y', b), -1), -127, ('i2i16', b))))
+    (('ldexp', 'a@16', b),
+     ('ldexp16_pan', a, ('i2i16', ('imin', ('imax', b, -127), 127))))
 ]