panfrost: implement 16-bit ldexp

Bifrost LDEXP.v2f16 takes a 16-bit exponent, which requires messy lowering. The codegen for this is quite bad currently, but would be improved by implementing unpack_32_2x16_split_*, and by fusing comparisons with CSEL. The main alternative is converting to F32, then LDEXP.f32, then converting back to F16. This has better codegen for dynamic exponents currently, but worse in the common case with a constant exponent where all the saturating cast logic can be folded. Fixes dEQP-VK.glsl.builtin.precision_fp16_storage16b.ldexp.compute.vec2 when shaderFloat16 is enabled in panvk. Signed-off-by: Benjamin Lee <benjamin.lee@collabora.com> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> Acked-by: Rebecca Mckeever <rebecca.mckeever@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33637>
2026-05-05 16:08:04 +02:00 · 2025-02-20 11:08:31 -08:00 · 2025-02-20 11:08:31 -08:00 · 252c59602e
commit 252c59602e
parent 2a70665df7
3 changed files with 62 additions and 0 deletions
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@ -1438,6 +1438,16 @@ unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
 unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
 unop_convert("fisfinite32", tbool32, tfloat, "isfinite(src0)")

+# panfrost-specific opcodes
+
+# 16-bit ldexp with 16-bit exponent for bifrost
+opcode("ldexp16_pan", 0, tfloat16, [0, 0], [tfloat16, tint16], False, "", """
+dst = ldexpf(src0, src1);
+/* flush denormals to zero. */
+if (!isnormal(dst))
+   dst = copysignf(0.0f, src0);
+""")
+
 # vc4-specific opcodes

 # Saturated vector add for 4 8bit ints.
--- a/src/panfrost/compiler/bi_lower_swizzle.c
+++ b/src/panfrost/compiler/bi_lower_swizzle.c
@ -108,6 +108,7 @@ lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
         break;

   /* No swizzles supported */
+   case BI_OPCODE_LDEXP_V2F16:
   case BI_OPCODE_HADD_V4U8:
   case BI_OPCODE_HADD_V4S8:
   case BI_OPCODE_CLZ_V4U8:
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@ -2911,6 +2911,12 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
      break;

   case nir_op_ldexp:
+      assert(sz == 32 && "should be lowered");
+      bi_ldexp_to(b, sz, dst, s0, s1);
+      break;
+
+   case nir_op_ldexp16_pan:
+      assert(sz == 16);
      bi_ldexp_to(b, sz, dst, s0, s1);
      break;

@ -5484,6 +5490,49 @@ bi_lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data)
   return true;
 }

+/* Bifrost LDEXP.v2f16 takes i16 exponent, while nir_op_ldexp takes i32. Lower
+ * to nir_op_ldexp16_pan. */
+static bool
+bi_lower_ldexp16(nir_builder *b, nir_alu_instr *alu, UNUSED void *data)
+{
+   if (alu->op != nir_op_ldexp || alu->def.bit_size != 16)
+      return false;
+
+   b->cursor = nir_before_instr(&alu->instr);
+
+   nir_def *x = nir_ssa_for_alu_src(b, alu, 0);
+   nir_def *exp32 = nir_ssa_for_alu_src(b, alu, 1);
+
+   /* From the GLSL 4.60 spec (section 8.3):
+    *
+    *    "If exp is greater than +128 (single-precision) or +1024
+    *     (double-precision), the value returned is undefined. If exp is less
+    *     than -126 (single- precision) or -1022 (double-precision), the value
+    *     returned may be flushed to zero."
+    *
+    * So we can't just truncate the exponent. Overflow is undefined behavior,
+    * but we need to return signed zero on underflow. If exp32 < INT16_MIN, we
+    * can use any 16-bit exponent that's sufficiently small to send all f16
+    * values to zero.
+    *
+    * If we test exp32 < INT16_MIN directly, the comparison could not be
+    * vectorized, so instead we test the upper half.
+    */
+   nir_def *exp16_high = nir_unpack_32_2x16_split_y(b, exp32);
+   nir_def *underflow = nir_ilt16(b, exp16_high, nir_imm_intN_t(b, -1, 16));
+
+   /* TODO: some possible values for this constant can be encoded as small
+    * immediates on valhall. None of the usable immediates have replicated i16
+    * lanes, but for example 0xFAFCFDFE would be {-1284,-514}, both of which
+    * are small enough. */
+   nir_def *min_exp = nir_imm_intN_t(b, -127, 16);
+   nir_def *exp16 = nir_b16csel(b, underflow, min_exp, nir_i2i16(b, exp32));
+
+   nir_def_replace(&alu->def, nir_ldexp16_pan(b, x, exp16));
+
+   return true;
+}
+
 void
 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
 {
@ -5642,6 +5691,8 @@ bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)

   NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_subgroups,
            nir_metadata_control_flow, &gpu_id);
+   NIR_PASS(_, nir, nir_shader_alu_pass, bi_lower_ldexp16,
+            nir_metadata_control_flow, NULL);

   NIR_PASS(_, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);