From f0c613765cd0603a0fa450c818556b81c6821222 Mon Sep 17 00:00:00 2001 From: Natalie Vock Date: Mon, 17 Feb 2025 18:42:49 +0100 Subject: [PATCH] aco: Add p_reload_preserved pseudo instruction These are helper instructions for the spill_preserved pass to insert reloads for registers that are preserved by the ABI, yet clobbered by the callee shader. There is one p_reload_preserved instruction at the end of each block. This allows us to insert reloads early, to alleviate the high latency of scratch reloads. Part-of: --- src/amd/compiler/aco_ir.h | 3 ++- src/amd/compiler/aco_live_var_analysis.cpp | 3 ++- src/amd/compiler/aco_opcodes.py | 2 ++ src/amd/compiler/aco_opt_value_numbering.cpp | 4 ++-- src/amd/compiler/aco_scheduler.cpp | 3 ++- .../aco_instruction_selection.h | 2 +- .../instruction_selection/aco_isel_cfg.cpp | 18 +++++++++--------- .../instruction_selection/aco_isel_helpers.cpp | 16 ++++++++++++++-- .../instruction_selection/aco_select_nir.cpp | 4 ++-- .../aco_select_ps_epilog.cpp | 2 +- .../aco_select_ps_prolog.cpp | 2 +- .../aco_select_trap_handler.cpp | 2 +- 12 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 3edeee7f7c7..871c96a6c42 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1998,7 +1998,8 @@ is_dead(const std::vector& uses, const Instruction* instr) { if (instr->definitions.empty() || instr->isBranch() || instr->isCall() || instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch || - instr->opcode == aco_opcode::p_dual_src_export_gfx11) + instr->opcode == aco_opcode::p_dual_src_export_gfx11 || + instr->opcode == aco_opcode::p_reload_preserved) return false; if (std::any_of(instr->definitions.begin(), instr->definitions.end(), diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 594db7dc5f3..84f17f8f6c7 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ 
b/src/amd/compiler/aco_live_var_analysis.cpp @@ -262,7 +262,8 @@ process_live_temps_per_block(live_ctx& ctx, Block* block) insn->operands[5].setLateKill(true); /* we re-use the destination reg in the middle */ } else if (insn->opcode == aco_opcode::v_interp_p1_f32 && ctx.program->dev.has_16bank_lds) { insn->operands[0].setLateKill(true); - } else if (insn->opcode == aco_opcode::p_init_scratch) { + } else if (insn->opcode == aco_opcode::p_init_scratch || + insn->opcode == aco_opcode::p_reload_preserved) { insn->operands.back().setLateKill(true); } else if (instr_info.classes[(int)insn->opcode] == instr_class::wmma) { insn->operands[0].setLateKill(true); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index ddb86e641c2..dfb457c3eaf 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -356,6 +356,8 @@ insn("p_unit_test") insn("p_callee_stack_ptr") +insn("p_reload_preserved") + insn("p_create_vector") insn("p_extract_vector") insn("p_split_vector") diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index df57a7c4619..dfd8c1e3891 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -316,8 +316,8 @@ can_eliminate(aco_ptr& instr) if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id || - instr->opcode == aco_opcode::p_shader_cycles_hi_lo_hi || - instr->definitions[0].isNoCSE()) + instr->opcode == aco_opcode::p_shader_cycles_hi_lo_hi || instr->definitions[0].isNoCSE() || + instr->opcode == aco_opcode::p_reload_preserved) return false; return true; diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index afbb7278363..e4816917997 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -488,7 +488,8 @@ 
is_reorderable(const Instruction* instr) instr->opcode != aco_opcode::p_end_with_regs && instr->opcode != aco_opcode::s_nop && instr->opcode != aco_opcode::s_sleep && instr->opcode != aco_opcode::s_trap && instr->opcode != aco_opcode::p_call && instr->opcode != aco_opcode::p_logical_start && - instr->opcode != aco_opcode::p_logical_end; + instr->opcode != aco_opcode::p_logical_end && + instr->opcode != aco_opcode::p_reload_preserved; } struct memory_event_set { diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index 29e9b631585..a3444824156 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -251,7 +251,7 @@ void end_empty_exec_skip(isel_context* ctx); /* aco_isel_helpers.cpp */ void append_logical_start(Block* b); -void append_logical_end(Block* b); +void append_logical_end(isel_context* ctx, bool append_reload_preserved = true); Temp get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit); Temp bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2)); Temp bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1)); diff --git a/src/amd/compiler/instruction_selection/aco_isel_cfg.cpp b/src/amd/compiler/instruction_selection/aco_isel_cfg.cpp index a3e3f642aca..65ae6fc98c8 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_cfg.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_cfg.cpp @@ -37,7 +37,7 @@ emit_loop_jump(isel_context* ctx, bool is_break) { Builder bld(ctx->program, ctx->block); Block* logical_target; - append_logical_end(ctx->block); + append_logical_end(ctx); unsigned idx = ctx->block->index; if (is_break) { @@ -119,7 +119,7 @@ update_exec_info(isel_context* ctx) void begin_loop(isel_context* ctx, loop_context* lc) { - append_logical_end(ctx->block); + append_logical_end(ctx); 
ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; Builder bld(ctx->program, ctx->block); bld.branch(aco_opcode::p_branch); @@ -158,7 +158,7 @@ end_loop(isel_context* ctx, loop_context* lc) if (!ctx->cf_info.has_branch) { unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; Builder bld(ctx->program, ctx->block); - append_logical_end(ctx->block); + append_logical_end(ctx); ctx->block->kind |= (block_kind_continue | block_kind_uniform); if (!ctx->cf_info.has_divergent_branch) @@ -201,7 +201,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) ic->cond = cond; - append_logical_end(ctx->block); + append_logical_end(ctx); ctx->block->kind |= block_kind_uniform; aco_ptr branch; @@ -239,7 +239,7 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else) Block* BB_then = ctx->block; if (!ctx->cf_info.has_branch) { - append_logical_end(BB_then); + append_logical_end(ctx); /* branch from then block to endif block */ aco_ptr branch; branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); @@ -272,7 +272,7 @@ end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else) if (!ctx->cf_info.has_branch) { if (logical_else) - append_logical_end(BB_else); + append_logical_end(ctx); /* branch from then block to endif block */ aco_ptr branch; branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); @@ -306,7 +306,7 @@ void begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond, nir_selection_control sel_ctrl) { - append_logical_end(ctx->block); + append_logical_end(ctx); ctx->block->kind |= block_kind_branch; /* branch to linear then block */ @@ -346,7 +346,7 @@ void begin_divergent_if_else(isel_context* ctx, if_context* ic, nir_selection_control sel_ctrl) { Block* BB_then_logical = ctx->block; - append_logical_end(BB_then_logical); + append_logical_end(ctx); /* branch from logical then block to invert block */ aco_ptr branch; 
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0)); @@ -398,7 +398,7 @@ void end_divergent_if(isel_context* ctx, if_context* ic) { Block* BB_else_logical = ctx->block; - append_logical_end(BB_else_logical); + append_logical_end(ctx); /* branch from logical else block to endif block */ aco_ptr branch; diff --git a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp index 2139274c875..e8cc029766f 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp @@ -41,9 +41,21 @@ append_logical_start(Block* b) } void -append_logical_end(Block* b) +append_logical_end(isel_context* ctx, bool append_reload_preserved) { - Builder(NULL, b).pseudo(aco_opcode::p_logical_end); + Builder bld(ctx->program, ctx->block); + + if (append_reload_preserved && ctx->program->is_callee) { + Operand stack_ptr_op; + if (ctx->program->gfx_level >= GFX9) + stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp()); + else + stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false)); + bld.pseudo(aco_opcode::p_reload_preserved, bld.def(s1), bld.def(bld.lm), bld.def(s1, scc), + stack_ptr_op); + } + + bld.pseudo(aco_opcode::p_logical_end); } Temp diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp index 50184e2082c..6a95c3dc6e1 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp @@ -1200,7 +1200,7 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c append_logical_start(ctx.block); split_arguments(&ctx, startpgm); visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body); - append_logical_end(ctx.block); + append_logical_end(&ctx); ctx.block->kind |= block_kind_uniform; /* Fix output registers and jump to 
next shader. We can skip this when dealing with a raygen @@ -1359,7 +1359,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons if (need_endpgm) { program->config->float_mode = program->blocks[0].fp_mode.val; - append_logical_end(ctx.block); + append_logical_end(&ctx); ctx.block->kind |= block_kind_uniform; if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) || diff --git a/src/amd/compiler/instruction_selection/aco_select_ps_epilog.cpp b/src/amd/compiler/instruction_selection/aco_select_ps_epilog.cpp index 006f2a42eae..bf83b8dd961 100644 --- a/src/amd/compiler/instruction_selection/aco_select_ps_epilog.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_ps_epilog.cpp @@ -470,7 +470,7 @@ select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config, program->config->float_mode = program->blocks[0].fp_mode.val; - append_logical_end(ctx.block); + append_logical_end(&ctx); ctx.block->kind |= block_kind_export_end; bld.reset(ctx.block); bld.sopp(aco_opcode::s_endpgm); diff --git a/src/amd/compiler/instruction_selection/aco_select_ps_prolog.cpp b/src/amd/compiler/instruction_selection/aco_select_ps_prolog.cpp index 5765d37450f..927df3c8eca 100644 --- a/src/amd/compiler/instruction_selection/aco_select_ps_prolog.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_ps_prolog.cpp @@ -304,7 +304,7 @@ select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config, program->config->float_mode = program->blocks[0].fp_mode.val; - append_logical_end(ctx.block); + append_logical_end(&ctx); build_end_with_regs(&ctx, regs); diff --git a/src/amd/compiler/instruction_selection/aco_select_trap_handler.cpp b/src/amd/compiler/instruction_selection/aco_select_trap_handler.cpp index 75af8a01e33..e0811c5625c 100644 --- a/src/amd/compiler/instruction_selection/aco_select_trap_handler.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_trap_handler.cpp @@ -499,7 +499,7 @@ 
select_trap_handler_shader(Program* program, ac_shader_config* config, program->config->float_mode = program->blocks[0].fp_mode.val; - append_logical_end(ctx.block); + append_logical_end(&ctx); ctx.block->kind |= block_kind_uniform; bld.sopp(aco_opcode::s_endpgm);