aco: Add p_reload_preserved pseudo instruction

These are helper instructions for the spill_preserved pass to insert
reloads for registers that are preserved by the ABI, yet
clobbered by the callee shader.

There is one p_reload_preserved instruction at the end of each block.
This allows us to insert reloads early, to alleviate the high latency of
scratch reloads.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37381>
This commit is contained in:
Natalie Vock 2025-02-17 18:42:49 +01:00 committed by Marge Bot
parent 9dbf49de2c
commit f0c613765c
12 changed files with 39 additions and 22 deletions

View file

@ -1998,7 +1998,8 @@ is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
{
if (instr->definitions.empty() || instr->isBranch() || instr->isCall() ||
instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch ||
instr->opcode == aco_opcode::p_dual_src_export_gfx11)
instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
instr->opcode == aco_opcode::p_reload_preserved)
return false;
if (std::any_of(instr->definitions.begin(), instr->definitions.end(),

View file

@ -262,7 +262,8 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
insn->operands[5].setLateKill(true); /* we re-use the destination reg in the middle */
} else if (insn->opcode == aco_opcode::v_interp_p1_f32 && ctx.program->dev.has_16bank_lds) {
insn->operands[0].setLateKill(true);
} else if (insn->opcode == aco_opcode::p_init_scratch) {
} else if (insn->opcode == aco_opcode::p_init_scratch ||
insn->opcode == aco_opcode::p_reload_preserved) {
insn->operands.back().setLateKill(true);
} else if (instr_info.classes[(int)insn->opcode] == instr_class::wmma) {
insn->operands[0].setLateKill(true);

View file

@ -356,6 +356,8 @@ insn("p_unit_test")
insn("p_callee_stack_ptr")
insn("p_reload_preserved")
insn("p_create_vector")
insn("p_extract_vector")
insn("p_split_vector")

View file

@ -316,8 +316,8 @@ can_eliminate(aco_ptr<Instruction>& instr)
if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi ||
instr->opcode == aco_opcode::p_linear_phi ||
instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id ||
instr->opcode == aco_opcode::p_shader_cycles_hi_lo_hi ||
instr->definitions[0].isNoCSE())
instr->opcode == aco_opcode::p_shader_cycles_hi_lo_hi || instr->definitions[0].isNoCSE() ||
instr->opcode == aco_opcode::p_reload_preserved)
return false;
return true;

View file

@ -488,7 +488,8 @@ is_reorderable(const Instruction* instr)
instr->opcode != aco_opcode::p_end_with_regs && instr->opcode != aco_opcode::s_nop &&
instr->opcode != aco_opcode::s_sleep && instr->opcode != aco_opcode::s_trap &&
instr->opcode != aco_opcode::p_call && instr->opcode != aco_opcode::p_logical_start &&
instr->opcode != aco_opcode::p_logical_end;
instr->opcode != aco_opcode::p_logical_end &&
instr->opcode != aco_opcode::p_reload_preserved;
}
struct memory_event_set {

View file

@ -251,7 +251,7 @@ void end_empty_exec_skip(isel_context* ctx);
/* aco_isel_helpers.cpp */
void append_logical_start(Block* b);
void append_logical_end(Block* b);
void append_logical_end(isel_context* ctx, bool append_reload_preserved = true);
Temp get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit);
Temp bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2));
Temp bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1));

View file

@ -37,7 +37,7 @@ emit_loop_jump(isel_context* ctx, bool is_break)
{
Builder bld(ctx->program, ctx->block);
Block* logical_target;
append_logical_end(ctx->block);
append_logical_end(ctx);
unsigned idx = ctx->block->index;
if (is_break) {
@ -119,7 +119,7 @@ update_exec_info(isel_context* ctx)
void
begin_loop(isel_context* ctx, loop_context* lc)
{
append_logical_end(ctx->block);
append_logical_end(ctx);
ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
Builder bld(ctx->program, ctx->block);
bld.branch(aco_opcode::p_branch);
@ -158,7 +158,7 @@ end_loop(isel_context* ctx, loop_context* lc)
if (!ctx->cf_info.has_branch) {
unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
Builder bld(ctx->program, ctx->block);
append_logical_end(ctx->block);
append_logical_end(ctx);
ctx->block->kind |= (block_kind_continue | block_kind_uniform);
if (!ctx->cf_info.has_divergent_branch)
@ -201,7 +201,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
ic->cond = cond;
append_logical_end(ctx->block);
append_logical_end(ctx);
ctx->block->kind |= block_kind_uniform;
aco_ptr<Instruction> branch;
@ -239,7 +239,7 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else)
Block* BB_then = ctx->block;
if (!ctx->cf_info.has_branch) {
append_logical_end(BB_then);
append_logical_end(ctx);
/* branch from then block to endif block */
aco_ptr<Instruction> branch;
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
@ -272,7 +272,7 @@ end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else)
if (!ctx->cf_info.has_branch) {
if (logical_else)
append_logical_end(BB_else);
append_logical_end(ctx);
/* branch from then block to endif block */
aco_ptr<Instruction> branch;
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
@ -306,7 +306,7 @@ void
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
nir_selection_control sel_ctrl)
{
append_logical_end(ctx->block);
append_logical_end(ctx);
ctx->block->kind |= block_kind_branch;
/* branch to linear then block */
@ -346,7 +346,7 @@ void
begin_divergent_if_else(isel_context* ctx, if_context* ic, nir_selection_control sel_ctrl)
{
Block* BB_then_logical = ctx->block;
append_logical_end(BB_then_logical);
append_logical_end(ctx);
/* branch from logical then block to invert block */
aco_ptr<Instruction> branch;
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
@ -398,7 +398,7 @@ void
end_divergent_if(isel_context* ctx, if_context* ic)
{
Block* BB_else_logical = ctx->block;
append_logical_end(BB_else_logical);
append_logical_end(ctx);
/* branch from logical else block to endif block */
aco_ptr<Instruction> branch;

View file

@ -41,9 +41,21 @@ append_logical_start(Block* b)
}
void
append_logical_end(Block* b)
append_logical_end(isel_context* ctx, bool append_reload_preserved)
{
Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
Builder bld(ctx->program, ctx->block);
if (append_reload_preserved && ctx->program->is_callee) {
Operand stack_ptr_op;
if (ctx->program->gfx_level >= GFX9)
stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp());
else
stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, -1u, false));
bld.pseudo(aco_opcode::p_reload_preserved, bld.def(s1), bld.def(bld.lm), bld.def(s1, scc),
stack_ptr_op);
}
bld.pseudo(aco_opcode::p_logical_end);
}
Temp

View file

@ -1200,7 +1200,7 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
append_logical_start(ctx.block);
split_arguments(&ctx, startpgm);
visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
append_logical_end(ctx.block);
append_logical_end(&ctx);
ctx.block->kind |= block_kind_uniform;
/* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
@ -1359,7 +1359,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons
if (need_endpgm) {
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
append_logical_end(&ctx);
ctx.block->kind |= block_kind_uniform;
if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||

View file

@ -470,7 +470,7 @@ select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
append_logical_end(&ctx);
ctx.block->kind |= block_kind_export_end;
bld.reset(ctx.block);
bld.sopp(aco_opcode::s_endpgm);

View file

@ -304,7 +304,7 @@ select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
append_logical_end(&ctx);
build_end_with_regs(&ctx, regs);

View file

@ -499,7 +499,7 @@ select_trap_handler_shader(Program* program, ac_shader_config* config,
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
append_logical_end(&ctx);
ctx.block->kind |= block_kind_uniform;
bld.sopp(aco_opcode::s_endpgm);