diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index fc4bd46d622..37ae125e2c8 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -157,7 +157,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
    if (!options->optimisations_disabled && !(debug_flags & DEBUG_NO_SCHED_ILP))
       schedule_ilp(program.get());
 
-   insert_fp_mode(program.get());
+   if (program->needs_fp_mode_insertion)
+      insert_fp_mode(program.get());
    insert_waitcnt(program.get());
    insert_NOPs(program.get());
 
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 7e14c28ba43..aee4ba133c7 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -233,6 +233,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
    program->next_fp_mode.denorm32 = 0;
    program->next_fp_mode.round16_64 = fp_round_ne;
    program->next_fp_mode.round32 = fp_round_ne;
+   program->needs_fp_mode_insertion = false;
 }
 
 bool
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index bc19a46c224..43560848953 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2142,6 +2142,7 @@ public:
    Stage stage;
    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
    bool needs_wqm = false; /* there exists a p_wqm instruction */
+   bool needs_fp_mode_insertion = false; /* insert_fp_mode should be run */
    bool has_smem_buffer_or_global_loads = false;
    bool has_pops_overlapped_waves_wait = false;
    bool has_color_exports = false;
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
index ec90116e6a6..7586ce51818 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
@@ -1429,6 +1429,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
       return select_program_rt(ctx, shader_count, shaders, args);
 
    if (shader_count >= 2) {
+      program->needs_fp_mode_insertion = true;
      select_program_merged(ctx, shader_count, shaders);
   } else {
      bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
@@ -1437,6 +1438,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
       if (ctx.program->info.merged_shader_compiled_separately) {
          assert(ctx.program->gfx_level >= GFX9);
+         program->needs_fp_mode_insertion = true;
         if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
            check_merged_wave_info = endif_merged_wave_info = true;
         } else {
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
index e34988f46ae..46922ab55b1 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
@@ -2510,9 +2510,10 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       }
       Temp src = get_alu_src(ctx, instr->src[0]);
       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) {
-         /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
-          * keep value numbering and the scheduler simpler.
+         /* We emit s_round_mode/s_setreg_imm32 in insert_fp_mode to
+          * keep value numbering and scheduling simpler.
           */
+         ctx->program->needs_fp_mode_insertion = true;
         if (dst.regClass() == v2b)
            bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src);
         else
@@ -2600,6 +2601,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
         }
      }
 
+      ctx->program->needs_fp_mode_insertion |= instr->op == nir_op_f2e4m3fn_satfn;
+
      aco_opcode opcode = instr->op == nir_op_f2e4m3fn || instr->op == nir_op_f2e4m3fn_sat
                             ? aco_opcode::v_cvt_pk_fp8_f32
                          : instr->op == nir_op_f2e4m3fn_satfn ? aco_opcode::p_v_cvt_pk_fp8_f32_ovfl
@@ -3215,10 +3218,12 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v1) {
         Temp f16;
-         if (ctx->block->fp_mode.round16_64 != fp_round_ne)
+         if (ctx->block->fp_mode.round16_64 != fp_round_ne) {
+            ctx->program->needs_fp_mode_insertion = true;
            f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src);
-         else
+         } else {
            f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
+         }
 
         if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
            bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16);
@@ -3254,10 +3259,12 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
         }
      } else if (dst.regClass() == s1) {
         Temp f16;
-         if (ctx->block->fp_mode.round16_64 != fp_round_ne)
+         if (ctx->block->fp_mode.round16_64 != fp_round_ne) {
+            ctx->program->needs_fp_mode_insertion = true;
            f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src);
-         else
+         } else {
            f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src);
+         }
 
         if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
            bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16);