diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index c2ee1668c09..cd45895e67c 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -1929,6 +1929,34 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                 return;
         }
 
+        /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
+         * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
+         * scalarizing due to scheduling (equal cost on Valhall). Additionally
+         * if the source is replicated the MKVEC.v2i16 can be optimized out.
+         */
+        case nir_op_u2f16:
+        case nir_op_i2f16: {
+                if (!(src_sz == 32 && comps == 2))
+                        break;
+
+                nir_alu_src *src = &instr->src[0];
+                bi_index idx = bi_src_index(&src->src);
+                bi_index s0 = bi_word(idx, src->swizzle[0]);
+                bi_index s1 = bi_word(idx, src->swizzle[1]);
+
+                bi_index t = (src->swizzle[0] == src->swizzle[1]) ?
+                        bi_half(s0, false) :
+                        bi_mkvec_v2i16(b, bi_half(s0, false),
+                                          bi_half(s1, false));
+
+                if (instr->op == nir_op_u2f16)
+                        bi_v2u16_to_v2f16_to(b, dst, t, BI_ROUND_NONE);
+                else
+                        bi_v2s16_to_v2f16_to(b, dst, t, BI_ROUND_NONE);
+
+                return;
+        }
+
         case nir_op_i2i8:
         case nir_op_u2u8:
         {
@@ -3306,8 +3334,6 @@ bi_vectorize_filter(const nir_instr *instr, void *data)
         case nir_op_ushr:
         case nir_op_f2i16:
         case nir_op_f2u16:
-        case nir_op_i2f16:
-        case nir_op_u2f16:
                 return false;
         default:
                 return true;