From f19b9eddb6f9907adb1d1c41a126468dbd355f68 Mon Sep 17 00:00:00 2001
From: Lorenzo Rossi
Date: Mon, 30 Mar 2026 10:47:54 +0200
Subject: [PATCH] pan/compiler: Replace bi_lower_ldexp16 with algebraic pass

Signed-off-by: Lorenzo Rossi
Reviewed-by: Faith Ekstrand
Reviewed-by: Christoph Pillmayer
Part-of:
---
NOTE(review), not part of the commit message (free text between "---" and
the first "diff --git" is dropped when the patch is applied):

* The rule flags underflow with ilt16(unpack_32_2x16_split_y(b), -1), i.e.
  only when the upper 16 bits of the exponent are below -1 (exp32 < -65536).
  For exp32 in [-65536, -32769] the upper half equals -1, so the csel picks
  i2i16(b); presumably that keeps just the low 16 bits, which are
  non-negative in this range, so no zero would be produced. The removed C
  pass used the same test, so this patch preserves behaviour - confirm
  whether that range matters.
* The C pass ran from bifrost_postprocess_nir right before
  nir_lower_alu_width; confirm algebraic_late runs at a point where 16-bit
  ldexp is still present and the result can still be vectorized.
* In the added Python comment the GLSL quote's "single- precision" is
  spelled "single-precision"; the removed C lines are left byte-exact.
* Line breaks follow the hunk line counts (43 + 3 removed, 23 added), but
  leading whitespace of context lines is reconstructed - regenerate with
  git format-patch before applying.

 .../compiler/bifrost/bifrost_compile.c        | 46 -------------------
 .../compiler/bifrost/bifrost_nir_algebraic.py | 23 ++++++++++
 2 files changed, 23 insertions(+), 46 deletions(-)

diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index d177237a169..c393fd16121 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -5972,49 +5972,6 @@ bifrost_nir_lower_load_output(nir_shader *nir)
                                      nir_metadata_control_flow, NULL);
 }
 
-/* Bifrost LDEXP.v2f16 takes i16 exponent, while nir_op_ldexp takes i32. Lower
- * to nir_op_ldexp16_pan. */
-static bool
-bi_lower_ldexp16(nir_builder *b, nir_alu_instr *alu, UNUSED void *data)
-{
-   if (alu->op != nir_op_ldexp || alu->def.bit_size != 16)
-      return false;
-
-   b->cursor = nir_before_instr(&alu->instr);
-
-   nir_def *x = nir_ssa_for_alu_src(b, alu, 0);
-   nir_def *exp32 = nir_ssa_for_alu_src(b, alu, 1);
-
-   /* From the GLSL 4.60 spec (section 8.3):
-    *
-    * "If exp is greater than +128 (single-precision) or +1024
-    * (double-precision), the value returned is undefined. If exp is less
-    * than -126 (single- precision) or -1022 (double-precision), the value
-    * returned may be flushed to zero."
-    *
-    * So we can't just truncate the exponent. Overflow is undefined behavior,
-    * but we need to return signed zero on underflow. If exp32 < INT16_MIN, we
-    * can use any 16-bit exponent that's sufficiently small to send all f16
-    * values to zero.
-    *
-    * If we test exp32 < INT16_MIN directly, the comparison could not be
-    * vectorized, so instead we test the upper half.
-    */
-   nir_def *exp16_high = nir_unpack_32_2x16_split_y(b, exp32);
-   nir_def *underflow = nir_ilt16(b, exp16_high, nir_imm_intN_t(b, -1, 16));
-
-   /* TODO: some possible values for this constant can be encoded as small
-    * immediates on valhall. None of the usable immediates have replicated i16
-    * lanes, but for example 0xFAFCFDFE would be {-1284,-514}, both of which
-    * are small enough. */
-   nir_def *min_exp = nir_imm_intN_t(b, -127, 16);
-   nir_def *exp16 = nir_b16csel(b, underflow, min_exp, nir_i2i16(b, exp32));
-
-   nir_def_replace(&alu->def, nir_ldexp16_pan(b, x, exp16));
-
-   return true;
-}
-
 void
 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
 {
@@ -6171,9 +6128,6 @@ bifrost_postprocess_nir(nir_shader *nir, unsigned gpu_id)
    NIR_PASS(_, nir, nir_lower_idiv,
             &(nir_lower_idiv_options){.allow_fp16 = true});
 
-   NIR_PASS(_, nir, nir_shader_alu_pass, bi_lower_ldexp16,
-            nir_metadata_control_flow, NULL);
-
    NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id);
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
    NIR_PASS(_, nir, nir_lower_phis_to_scalar, bi_vectorize_filter, &gpu_id);
diff --git a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
index 3a5ded6b8ce..7988355d1fd 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
+++ b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
@@ -122,6 +122,29 @@ for bsz in [8, 16, 32]:
         ((f'b2f{fsz}', f'a@{bsz}'), (f'b{fsz}csel', a_fsz, 1.0, 0.0)),
     ]
 
+# Bifrost LDEXP.v2f16 takes i16 exponent, while nir_op_ldexp takes i32. Lower
+# to nir_op_ldexp16_pan.
+#
+# From the GLSL 4.60 spec (section 8.3):
+#   "If exp is greater than +128 (single-precision) or +1024
+#   (double-precision), the value returned is undefined. If exp is less
+#   than -126 (single-precision) or -1022 (double-precision), the value
+#   returned may be flushed to zero."
+#
+# So we can't just truncate the exponent. Overflow is undefined behavior, but
+# we need to return signed zero on underflow. If exp32 < INT16_MIN, we can use
+# any 16-bit exponent that's sufficiently small to send all f16 values to zero.
+#
+# If we test exp32 < INT16_MIN directly, the comparison could not be
+# vectorized, so instead we test the upper half.
+# TODO: some possible values for -127 can be encoded as small
+# immediates on valhall. None of the usable immediates have replicated
+# i16 lanes, but for example 0xFAFCFDFE would be {-1284,-514}, both of
+# which are small enough.
+algebraic_late += [
+    (('ldexp', 'a@16', b), ('ldexp16_pan', a, ('b16csel', ('ilt16', ('unpack_32_2x16_split_y', b), -1), -127, ('i2i16', b))))
+]
+
 
 def main():
     parser = argparse.ArgumentParser()