From 91c473e49ace2efbd6490cd63896743cddbc730a Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Thu, 20 Feb 2025 00:26:21 -0800 Subject: [PATCH] panfrost: fix large int32->float16 conversions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On vulkan, truncating to S/U16 before converting is not valid, because out-of-range conversions are specified to be correctly rounded. IEEE 754 requires that out-of-range values round to ±inf with RTNE and ±F16_MAX with RTZ. On gl, truncating is valid for U16->F16, because out-of-range int->float conversions are undefined behavior. For S16->F16, it is not valid because S16_MAX < F16_MAX, so some in-range values will be truncated as well. Instead, just handle S/U16->F16 as S/U16->F32->F16. Fixes dEQP-VK.spirv_assembly.instruction.compute.convertstof.int32_to_float16_* when shaderFloat16 is enabled in panvk. Signed-off-by: Benjamin Lee Fixes: be74b84e6f8 ("pan/bi: Fill in some more conversions") Reviewed-by: Boris Brezillon Acked-by: Rebecca Mckeever Part-of: (cherry picked from commit a33cd3def23dcf740ee04fb39e7b8bf1312d090c) --- .pick_status.json | 2 +- src/panfrost/compiler/bifrost_compile.c | 44 ++++++++++--------- .../compiler/bifrost_nir_algebraic.py | 4 ++ 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index 66682dde2b5..f56cf69db54 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -54,7 +54,7 @@ "description": "panfrost: fix large int32->float16 conversions", "nominated": true, "nomination_type": 2, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": "be74b84e6f81c6a75e8488c0757ac7f911f4931d", "notes": null diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index 20864bbc28e..81ae3c71beb 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -2652,13 +2652,12 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) return; } - /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to - * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than - * scalarizing due to scheduling (equal cost on Valhall). Additionally - * if the source is replicated the MKVEC.v2i16 can be optimized out. - */ - case nir_op_u2f16: - case nir_op_i2f16: { + /* Pre-v11, we can get vector i2f32 by lowering 32-bit vector i2f16 to + * i2f32 + f2f16 in bifrost_nir_lower_algebraic_late, which runs after + * nir_opt_vectorize. We don't scalarize i2f32 earlier because we have + * vector V2F16_TO_V2F32. */ + case nir_op_i2f32: + case nir_op_u2f32: { if (!(src_sz == 32 && comps == 2)) break; @@ -2667,15 +2666,16 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) bi_index s0 = bi_extract(b, idx, src->swizzle[0]); bi_index s1 = bi_extract(b, idx, src->swizzle[1]); - bi_index t = - (src->swizzle[0] == src->swizzle[1]) - ? bi_half(s0, false) - : bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false)); + bi_index d0, d1; + if (instr->op == nir_op_i2f32) { + d0 = bi_s32_to_f32(b, s0); + d1 = bi_s32_to_f32(b, s1); + } else { + d0 = bi_u32_to_f32(b, s0); + d1 = bi_u32_to_f32(b, s1); + } - if (instr->op == nir_op_u2f16) - bi_v2u16_to_v2f16_to(b, dst, t); - else - bi_v2s16_to_v2f16_to(b, dst, t); + bi_collect_v2i32_to(b, dst, d0, d1); return; } @@ -2969,9 +2969,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_u2f16: - if (src_sz == 32) - bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false)); - else if (src_sz == 16) + /* V2I32_TO_V2F16 does not exist */ + assert((src_sz == 16 || src_sz == 8) && "should be lowered"); + + if (src_sz == 16) bi_v2u16_to_v2f16_to(b, dst, s0); else if (src_sz == 8) bi_v2u8_to_v2f16_to(b, dst, s0); @@ -2987,9 +2988,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) break; case nir_op_i2f16: - if (src_sz == 32) - bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false)); - else if (src_sz == 16) + /* V2I32_TO_V2F16 does not exist */ + assert((src_sz == 16 || src_sz == 8) && "should be lowered"); + + if (src_sz == 16) bi_v2s16_to_v2f16_to(b, dst, s0); else if (src_sz == 8) bi_v2s8_to_v2f16_to(b, dst, s0); diff --git a/src/panfrost/compiler/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost_nir_algebraic.py index 362266569fc..36d11ce5e29 100644 --- a/src/panfrost/compiler/bifrost_nir_algebraic.py +++ b/src/panfrost/compiler/bifrost_nir_algebraic.py @@ -75,6 +75,10 @@ algebraic_late = [ # XXX: Duplicate of nir_lower_pack (('unpack_64_2x32', a), ('vec2', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a))), + + # We don't have S32_TO_F16 on any arch + (('i2f16', 'a@32'), ('f2f16', ('i2f32', a))), + (('u2f16', 'a@32'), ('f2f16', ('u2f32', a))), ] # Handling all combinations of boolean and float sizes for b2f is nontrivial.