aco: only insert fp mode when needed
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35746>
This commit is contained in:
Georg Lehmann 2025-06-30 16:11:42 +02:00 committed by Marge Bot
parent 46c1bd1147
commit d45f375a9d
5 changed files with 19 additions and 7 deletions

View file

@ -157,7 +157,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
if (!options->optimisations_disabled && !(debug_flags & DEBUG_NO_SCHED_ILP))
schedule_ilp(program.get());
insert_fp_mode(program.get());
if (program->needs_fp_mode_insertion)
insert_fp_mode(program.get());
insert_waitcnt(program.get());
insert_NOPs(program.get());

View file

@ -233,6 +233,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
program->next_fp_mode.denorm32 = 0;
program->next_fp_mode.round16_64 = fp_round_ne;
program->next_fp_mode.round32 = fp_round_ne;
program->needs_fp_mode_insertion = false;
}
bool

View file

@ -2142,6 +2142,7 @@ public:
Stage stage;
bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
bool needs_wqm = false; /* there exists a p_wqm instruction */
bool needs_fp_mode_insertion = false; /* insert_fp_mode should be run */
bool has_smem_buffer_or_global_loads = false;
bool has_pops_overlapped_waves_wait = false;
bool has_color_exports = false;

View file

@ -1429,6 +1429,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
return select_program_rt(ctx, shader_count, shaders, args);
if (shader_count >= 2) {
program->needs_fp_mode_insertion = true;
select_program_merged(ctx, shader_count, shaders);
} else {
bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
@ -1437,6 +1438,7 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const
/* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
if (ctx.program->info.merged_shader_compiled_separately) {
assert(ctx.program->gfx_level >= GFX9);
program->needs_fp_mode_insertion = true;
if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
check_merged_wave_info = endif_merged_wave_info = true;
} else {

View file

@ -2510,9 +2510,10 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
}
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) {
/* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
* keep value numbering and the scheduler simpler.
/* We emit s_round_mode/s_setreg_imm32 in insert_fp_mode to
* keep value numbering and scheduling simpler.
*/
ctx->program->needs_fp_mode_insertion = true;
if (dst.regClass() == v2b)
bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src);
else
@ -2600,6 +2601,8 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
}
}
ctx->program->needs_fp_mode_insertion |= instr->op == nir_op_f2e4m3fn_satfn;
aco_opcode opcode = instr->op == nir_op_f2e4m3fn || instr->op == nir_op_f2e4m3fn_sat
? aco_opcode::v_cvt_pk_fp8_f32
: instr->op == nir_op_f2e4m3fn_satfn ? aco_opcode::p_v_cvt_pk_fp8_f32_ovfl
@ -3215,10 +3218,12 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v1) {
Temp f16;
if (ctx->block->fp_mode.round16_64 != fp_round_ne)
if (ctx->block->fp_mode.round16_64 != fp_round_ne) {
ctx->program->needs_fp_mode_insertion = true;
f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src);
else
} else {
f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
}
if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16);
@ -3254,10 +3259,12 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
}
} else if (dst.regClass() == s1) {
Temp f16;
if (ctx->block->fp_mode.round16_64 != fp_round_ne)
if (ctx->block->fp_mode.round16_64 != fp_round_ne) {
ctx->program->needs_fp_mode_insertion = true;
f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src);
else
} else {
f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src);
}
if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16);