diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index fc4bd46d622..37ae125e2c8 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -157,7 +157,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
    if (!options->optimisations_disabled && !(debug_flags & DEBUG_NO_SCHED_ILP))
       schedule_ilp(program.get());
 
-   insert_fp_mode(program.get());
+   if (program->needs_fp_mode_insertion)
+      insert_fp_mode(program.get());
    insert_waitcnt(program.get());
    insert_NOPs(program.get());
 
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 7e14c28ba43..aee4ba133c7 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -233,6 +233,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
    program->next_fp_mode.denorm32 = 0;
    program->next_fp_mode.round16_64 = fp_round_ne;
    program->next_fp_mode.round32 = fp_round_ne;
+   program->needs_fp_mode_insertion = false;
 }
 
 bool
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index bc19a46c224..43560848953 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2142,6 +2142,7 @@ public:
    Stage stage;
    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
    bool needs_wqm = false; /* there exists a p_wqm instruction */
+   bool needs_fp_mode_insertion = false; /* insert_fp_mode should be run */
    bool has_smem_buffer_or_global_loads = false;
    bool has_pops_overlapped_waves_wait = false;
    bool has_color_exports = false;
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
index ec90116e6a6..7586ce51818 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
@@ -1429,6 +1429,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
       return select_program_rt(ctx, shader_count, shaders, args);
 
    if (shader_count >= 2) {
+      program->needs_fp_mode_insertion = true;
      select_program_merged(ctx, shader_count, shaders);
   } else {
      bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
@@ -1437,6 +1438,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
       if (ctx.program->info.merged_shader_compiled_separately) {
          assert(ctx.program->gfx_level >= GFX9);
+         program->needs_fp_mode_insertion = true;
         if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
            check_merged_wave_info = endif_merged_wave_info = true;
         } else {
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
index e34988f46ae..46922ab55b1 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_alu.cpp
@@ -2510,9 +2510,10 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       }
       Temp src = get_alu_src(ctx, instr->src[0]);
       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) {
-         /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
-          * keep value numbering and the scheduler simpler.
+         /* We emit s_round_mode/s_setreg_imm32 in insert_fp_mode to
+          * keep value numbering and scheduling simpler.
           */
+         ctx->program->needs_fp_mode_insertion = true;
         if (dst.regClass() == v2b)
            bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src);
         else
@@ -2600,6 +2601,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
         }
      }
 
+      ctx->program->needs_fp_mode_insertion |= instr->op == nir_op_f2e4m3fn_satfn;
+
      aco_opcode opcode = instr->op == nir_op_f2e4m3fn || instr->op == nir_op_f2e4m3fn_sat
                             ? aco_opcode::v_cvt_pk_fp8_f32
                          : instr->op == nir_op_f2e4m3fn_satfn ? aco_opcode::p_v_cvt_pk_fp8_f32_ovfl
@@ -3215,10 +3218,12 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v1) {
         Temp f16;
-         if (ctx->block->fp_mode.round16_64 != fp_round_ne)
+         if (ctx->block->fp_mode.round16_64 != fp_round_ne) {
+            ctx->program->needs_fp_mode_insertion = true;
            f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src);
-         else
+         } else {
            f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
+         }
 
         if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
            bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16);
@@ -3254,10 +3259,12 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
         }
      } else if (dst.regClass() == s1) {
         Temp f16;
-         if (ctx->block->fp_mode.round16_64 != fp_round_ne)
+         if (ctx->block->fp_mode.round16_64 != fp_round_ne) {
+            ctx->program->needs_fp_mode_insertion = true;
            f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src);
-         else
+         } else {
            f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src);
+         }
 
         if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
            bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16);