diff --git a/src/panfrost/compiler/bifrost/bifrost_nir.c b/src/panfrost/compiler/bifrost/bifrost_nir.c
index e2772d2c755..effaa263fdf 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir.c
+++ b/src/panfrost/compiler/bifrost/bifrost_nir.c
@@ -949,6 +949,7 @@ bifrost_postprocess_nir(nir_shader *nir,
    NIR_PASS(_, nir, pan_nir_lower_tex, gpu_id);
    NIR_PASS(_, nir, pan_nir_lower_image, gpu_id);
 
+   NIR_PASS(_, nir, pan_nir_fuse_io_cvt, gpu_id, &info->varyings.formats);
    /* Our OpenCL compiler (src/panfrost/clc/pan_compile.c) has a very weird and
     * suboptimal optimization pipeline that results in a lot of unoptimized
     * memcpys and sparse scratch space.  That code is still being used for
diff --git a/src/panfrost/compiler/meson.build b/src/panfrost/compiler/meson.build
index c13a777ebc0..830aabb062c 100644
--- a/src/panfrost/compiler/meson.build
+++ b/src/panfrost/compiler/meson.build
@@ -5,6 +5,7 @@ libpanfrost_compiler_files = files(
   'pan_compiler.c',
   'pan_compiler.h',
   'pan_nir_collect_varyings.c',
+  'pan_nir_fuse_io_cvt.c',
   'pan_nir_lower_bool_to_bitsize.c',
   'pan_nir_lower_var_special.c',
   'pan_nir_lower_framebuffer.c',
diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h
index ac84b41bb6e..bec99d2a8ee 100644
--- a/src/panfrost/compiler/pan_nir.h
+++ b/src/panfrost/compiler/pan_nir.h
@@ -262,4 +262,7 @@ uint32_t pan_nir_collect_noperspective_varyings_fs(nir_shader *s);
 bool pan_nir_resize_varying_io(nir_shader *nir,
                                const struct pan_varying_layout *varying_layout);
 
+bool pan_nir_fuse_io_cvt(nir_shader *nir, uint64_t gpu_id,
+                         struct pan_varying_layout *layout);
+
 #endif /* __PAN_NIR_H__ */
diff --git a/src/panfrost/compiler/pan_nir_fuse_io_cvt.c b/src/panfrost/compiler/pan_nir_fuse_io_cvt.c
new file mode 100644
index 00000000000..c2d91a5846c
--- /dev/null
+++ b/src/panfrost/compiler/pan_nir_fuse_io_cvt.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2026 Amazon.com, Inc. or its affiliates.
+ * SPDX-License-Identifier: MIT
+ */
+#include "pan_nir.h"
+
+static unsigned
+nir_src_float_cvt_bits(nir_src *use, bool *is_mp)
+{
+   nir_instr *parent = nir_src_use_instr(use);
+
+   if (parent->type != nir_instr_type_alu)
+      return 0;
+
+   nir_alu_instr *alu = nir_instr_as_alu(parent);
+
+   switch (alu->op) {
+      case nir_op_f2f16:
+         return 16;
+      case nir_op_f2fmp:
+         *is_mp |= true;
+         return 16;
+      case nir_op_f2f32:
+         return 32;
+      default:
+         return 0;
+   }
+}
+
+static bool
+op_supports_cvt_fusion(nir_intrinsic_instr *instr, uint64_t gpu_id)
+{
+   /* We might also convert LD_CVT but I haven't seen any case where it's
+    * useful, maybe enable it when we have a case to check it on.
+    */
+   switch (instr->intrinsic) {
+      case nir_intrinsic_load_var_pan:
+      case nir_intrinsic_load_var_buf_pan:
+         /* LD_VAR[_BUF] performs conversion BEFORE interpolation, we cannot
+          * just change the interpolation semantics at highp.  mediump on the
+          * other hand lets us juggle between 32 and 16 bits freely.
+          */
+         return nir_intrinsic_io_semantics(instr).medium_precision;
+      case nir_intrinsic_load_var_flat_pan:
+         return true;
+      case nir_intrinsic_load_var_buf_flat_pan:
+         /* TODO: v14 can even fuse flat buf conversions */
+         return false;
+      default:
+         return false;
+   }
+}
+
+struct fuse_ctx {
+   uint64_t gpu_id;
+   struct pan_varying_layout *layout;
+};
+
+static bool
+fuse_io_instr(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   const struct fuse_ctx *ctx = data;
+
+   if (!op_supports_cvt_fusion(intr, ctx->gpu_id))
+      return false;
+
+   unsigned orig_bit_size = intr->def.bit_size;
+   assert(orig_bit_size == 32 || orig_bit_size == 16);
+   unsigned converted_bit_size = orig_bit_size == 32 ? 16 : 32;
+   bool is_mp = false;
+
+   /* Check if all usages are conversions */
+   nir_foreach_use_including_if(src, &intr->def) {
+      if (nir_src_is_if(src) ||
+          nir_src_float_cvt_bits(src, &is_mp) != converted_bit_size)
+         return false;
+   }
+
+   /* If they are, the load is always followed by conversion and we thus can
+    * fuse the cvt into the load.
+    */
+   intr->def.bit_size = converted_bit_size;
+   /* Update the dest_type.  This will not change the in-memory representation
+    * of _buf intrinsics as those are stored in the src_type.
+    */
+   if (nir_intrinsic_has_dest_type(intr)) {
+      nir_alu_type dest_type = nir_intrinsic_dest_type(intr);
+      nir_alu_type base_type = nir_get_glsl_base_type_for_nir_type(dest_type);
+      nir_intrinsic_set_dest_type(intr, nir_type_float | converted_bit_size);
+
+      if (base_type != nir_type_float) {
+         const nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         /* Right now we have int descriptors, but the loaded value is always
+          * used as a flot, no harm in just "promoting" it to float.  The cast
+          * is to discard the const modifier, this is safe.
+          */
+         struct pan_varying_slot *slot = (struct pan_varying_slot *)
+            pan_varying_layout_find_slot(ctx->layout, sem.location);
+         slot->alu_type = nir_alu_type_get_type_size(slot->alu_type) |
+                          nir_type_float;
+      }
+   }
+
+   /* We don't remove conversions, nir_opt_algebraic will fold f2f16 a@16
+    * and f2f32 a@32 automatically, everything but f2fmp of course.
+    */
+   if (is_mp) {
+      b->cursor = nir_after_instr(&intr->instr);
+      nir_def *up_cvt = nir_f2f32(b, &intr->def);
+      nir_def_rewrite_uses_after(&intr->def, up_cvt);
+   }
+
+   return true;
+}
+
+bool
+pan_nir_fuse_io_cvt(nir_shader *nir, uint64_t gpu_id,
+                    struct pan_varying_layout *layout)
+{
+   struct fuse_ctx ctx = {
+      .gpu_id = gpu_id,
+      .layout = layout,
+   };
+   return nir_shader_intrinsics_pass(nir, fuse_io_instr,
+                                     nir_metadata_control_flow, (void *)&ctx);
+}