From 22ffe720222d39f4fb479ed19fda132b102a601a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Wed, 20 Nov 2024 14:32:23 +0100 Subject: [PATCH] aco: move branch lowering optimization into separate file 'aco_lower_branches.cpp' No fossil changes. Part-of: --- src/amd/compiler/aco_interface.cpp | 2 + src/amd/compiler/aco_ir.h | 1 + src/amd/compiler/aco_lower_branches.cpp | 178 +++++++++++++++++++++ src/amd/compiler/aco_lower_to_hw_instr.cpp | 134 ---------------- src/amd/compiler/meson.build | 1 + 5 files changed, 182 insertions(+), 134 deletions(-) create mode 100644 src/amd/compiler/aco_lower_branches.cpp diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index 57434bd92be..528f5564cca 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -178,6 +178,7 @@ aco_postprocess_shader(const struct aco_compiler_options* options, /* Lower to HW Instructions */ lower_to_hw_instr(program.get()); + lower_branches(program.get()); validate(program.get()); if (!options->optimisations_disabled && !(debug_flags & DEBUG_NO_SCHED_VOPD)) @@ -422,6 +423,7 @@ aco_compile_trap_handler(const struct aco_compiler_options* options, validate(program.get()); lower_to_hw_instr(program.get()); + lower_branches(program.get()); validate(program.get()); insert_waitcnt(program.get()); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 4a8dd1582f1..4b5a30893c5 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2252,6 +2252,7 @@ void insert_exec_mask(Program* program); void value_numbering(Program* program); void optimize(Program* program); void optimize_postRA(Program* program); +void lower_branches(Program* program); void setup_reduce_temp(Program* program); void lower_to_cssa(Program* program); void register_allocation(Program* program, ra_test_policy = {}); diff --git a/src/amd/compiler/aco_lower_branches.cpp b/src/amd/compiler/aco_lower_branches.cpp new 
file mode 100644
index 00000000000..b58b627fc24
--- /dev/null
+++ b/src/amd/compiler/aco_lower_branches.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright © 2024 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "aco_builder.h"
+#include "aco_ir.h"
+
+namespace aco {
+namespace {
+
+/* State shared by the branch lowering helpers: the program being processed. */
+struct branch_ctx {
+   Program* program;
+
+   branch_ctx(Program* program_) : program(program_) {}
+};
+
+/**
+ * Check if the branch instruction can be removed:
+ * This is beneficial when executing the next block with an empty exec mask
+ * is faster than the branch instruction itself.
+ *
+ * Override this judgement when:
+ * - The application prefers to remove control flow
+ * - The compiler stack knows that it's a divergent branch never taken
+ *
+ * ctx provides the program (its blocks and gfx_level), block is the block
+ * the branch terminates and branch is the pseudo branch being considered.
+ * Returns true when no hardware branch needs to be emitted for it.
+ * Back-edges are never removable; never_taken branches always are.
+ */
+bool
+can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
+{
+   const uint32_t target = branch->target[0];
+   /* Everything except a conditional branch on exec counts as uniform. */
+   const bool uniform_branch =
+      !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) &&
+        branch->operands[0].physReg() == exec);
+
+   if (branch->never_taken) {
+      assert(!uniform_branch);
+      return true;
+   }
+
+   /* Cannot remove back-edges. */
+   if (block.index >= target)
+      return false;
+
+   const bool prefer_remove = branch->rarely_taken;
+   unsigned num_scalar = 0;
+   unsigned num_vector = 0;
+
+   /* Check the instructions between branch and target */
+   for (unsigned i = block.index + 1; i < target; i++) {
+      /* Uniform conditional branches must not be ignored if they
+       * are about to jump over actual instructions */
+      if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
+         return false;
+
+      for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
+         if (instr->isSOPP()) {
+            /* Discard early exits and loop breaks and continues should work fine with
+             * an empty exec mask.
+             */
+            if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
+                instr->opcode == aco_opcode::s_cbranch_scc1) {
+               bool is_break_continue =
+                  ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue);
+               bool discard_early_exit =
+                  ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
+               if (is_break_continue || discard_early_exit)
+                  continue;
+            }
+            return false;
+         } else if (instr->isSALU()) {
+            num_scalar++;
+         } else if (instr->isVALU() || instr->isVINTRP()) {
+            if (instr->opcode == aco_opcode::v_writelane_b32 ||
+                instr->opcode == aco_opcode::v_writelane_b32_e64) {
+               /* writelane ignores exec, writing inactive lanes results in UB. */
+               return false;
+            }
+            num_vector++;
+            /* VALU which writes SGPRs are always executed on GFX10+ */
+            if (ctx.program->gfx_level >= GFX10) {
+               for (Definition& def : instr->definitions) {
+                  if (def.regClass().type() == RegType::sgpr)
+                     num_scalar++;
+               }
+            }
+         } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
+            /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
+             * SMEM might be an invalid access, and barriers are probably expensive. */
+            return false;
+         } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
+            // TODO: GFX6-9 can use vskip
+            if (!prefer_remove)
+               return false;
+         } else if (instr->opcode != aco_opcode::p_debug_info) {
+            assert(false && "Pseudo instructions should be lowered by this point.");
+            return false;
+         }
+
+         if (!prefer_remove) {
+            /* Under these conditions, we shouldn't remove the branch.
+             * Don't care about the estimated cycles when the shader prefers flattening.
+             */
+            unsigned est_cycles;
+            if (ctx.program->gfx_level >= GFX10)
+               est_cycles = num_scalar * 2 + num_vector;
+            else
+               est_cycles = num_scalar * 4 + num_vector * 4;
+
+            if (est_cycles > 16)
+               return false;
+         }
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Lower the pseudo branch at the end of a block, if there is one.
+ *
+ * The pseudo branch is removed from the block. Unless can_remove_branch()
+ * allows dropping it, the matching SOPP branch is appended instead, chosen
+ * from the pseudo opcode and the condition register (exec, vcc or scc).
+ */
+void
+lower_branch_instruction(branch_ctx& ctx, Block& block)
+{
+   if (block.instructions.empty() || !block.instructions.back()->isBranch())
+      return;
+
+   /* Take ownership first: the pseudo branch is still read after pop_back(). */
+   aco_ptr<Instruction> branch = std::move(block.instructions.back());
+   const uint32_t target = branch->branch().target[0];
+   block.instructions.pop_back();
+
+   if (can_remove_branch(ctx, block, &branch->branch()))
+      return;
+
+   /* emit branch instruction */
+   Builder bld(ctx.program, &block.instructions);
+   switch (branch->opcode) {
+   case aco_opcode::p_branch:
+      assert(block.linear_succs[0] == target);
+      bld.sopp(aco_opcode::s_branch, target);
+      break;
+   case aco_opcode::p_cbranch_nz:
+      assert(block.linear_succs[1] == target);
+      if (branch->operands[0].physReg() == exec)
+         bld.sopp(aco_opcode::s_cbranch_execnz, target);
+      else if (branch->operands[0].physReg() == vcc)
+         bld.sopp(aco_opcode::s_cbranch_vccnz, target);
+      else {
+         assert(branch->operands[0].physReg() == scc);
+         bld.sopp(aco_opcode::s_cbranch_scc1, target);
+      }
+      break;
+   case aco_opcode::p_cbranch_z:
+      assert(block.linear_succs[1] == target);
+      if (branch->operands[0].physReg() == exec)
+         bld.sopp(aco_opcode::s_cbranch_execz, target);
+      else if (branch->operands[0].physReg() == vcc)
+         bld.sopp(aco_opcode::s_cbranch_vccz, target);
+      else {
+         assert(branch->operands[0].physReg() == scc);
+         bld.sopp(aco_opcode::s_cbranch_scc0, target);
+      }
+      break;
+   default: unreachable("Unknown Pseudo branch instruction!");
+   }
+}
+
+} /* end namespace */
+
+/**
+ * Lower the trailing pseudo branch of every block in the program.
+ *
+ * Blocks are visited from last to first, so the blocks examined by
+ * can_remove_branch() (those after the current one) were already processed.
+ */
+void
+lower_branches(Program* program)
+{
+   branch_ctx ctx(program);
+
+   for (int i = program->blocks.size() - 1; i >= 0; i--) {
+      Block& block = program->blocks[i];
+      lower_branch_instruction(ctx, block);
+   }
+}
+
+} // namespace aco
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp
b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 369b9b13d6e..5ec32ab004c 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2843,140 +2843,6 @@ lower_to_hw_instr(Program* program) } default: break; } - } else if (instr->isBranch()) { - Pseudo_branch_instruction* branch = &instr->branch(); - const uint32_t target = branch->target[0]; - const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z || - branch->opcode == aco_opcode::p_cbranch_nz) && - branch->operands[0].physReg() == exec); - - if (branch->never_taken) { - assert(!uniform_branch); - continue; - } - - /* Check if the branch instruction can be removed. - * This is beneficial when executing the next block with an empty exec mask - * is faster than the branch instruction itself. - * - * Override this judgement when: - * - The application prefers to remove control flow - * - The compiler stack knows that it's a divergent branch always taken - */ - const bool prefer_remove = branch->rarely_taken; - bool can_remove = block->index < target; - unsigned num_scalar = 0; - unsigned num_vector = 0; - - /* Check the instructions between branch and target */ - for (unsigned i = block->index + 1; i < branch->target[0]; i++) { - /* Uniform conditional branches must not be ignored if they - * are about to jump over actual instructions */ - if (uniform_branch && !program->blocks[i].instructions.empty()) - can_remove = false; - - if (!can_remove) - break; - - for (aco_ptr& inst : program->blocks[i].instructions) { - if (inst->isSOPP()) { - if (instr_info.classes[(int)inst->opcode] == instr_class::branch) { - /* Discard early exits and loop breaks and continues should work fine with - * an empty exec mask. 
- */ - bool is_break_continue = - program->blocks[i].kind & (block_kind_break | block_kind_continue); - bool discard_early_exit = - program->blocks[inst->salu().imm].kind & block_kind_discard_early_exit; - if ((inst->opcode != aco_opcode::s_cbranch_scc0 && - inst->opcode != aco_opcode::s_cbranch_scc1) || - (!discard_early_exit && !is_break_continue)) - can_remove = false; - } else { - can_remove = false; - } - } else if (inst->isSALU()) { - num_scalar++; - } else if (inst->isVALU() || inst->isVINTRP()) { - if (inst->opcode == aco_opcode::v_writelane_b32 || - inst->opcode == aco_opcode::v_writelane_b32_e64) { - /* writelane ignores exec, writing inactive lanes results in UB. */ - can_remove = false; - } - num_vector++; - /* VALU which writes SGPRs are always executed on GFX10+ */ - if (ctx.program->gfx_level >= GFX10) { - for (Definition& def : inst->definitions) { - if (def.regClass().type() == RegType::sgpr) - num_scalar++; - } - } - } else if (inst->isEXP() || inst->isSMEM() || inst->isBarrier()) { - /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs), - * SMEM might be an invalid access, and barriers are probably expensive. */ - can_remove = false; - } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() || - inst->isLDSDIR()) { - // TODO: GFX6-9 can use vskip - can_remove = prefer_remove; - } else if (inst->opcode != aco_opcode::p_debug_info) { - can_remove = false; - assert(false && "Pseudo instructions should be lowered by this point."); - } - - if (!prefer_remove) { - /* Under these conditions, we shouldn't remove the branch. - * Don't care about the estimated cycles when the shader prefers flattening. 
- */ - unsigned est_cycles; - if (ctx.program->gfx_level >= GFX10) - est_cycles = num_scalar * 2 + num_vector; - else - est_cycles = num_scalar * 4 + num_vector * 4; - - if (est_cycles > 16) - can_remove = false; - } - - if (!can_remove) - break; - } - } - - if (can_remove) - continue; - - /* emit branch instruction */ - switch (instr->opcode) { - case aco_opcode::p_branch: - assert(block->linear_succs[0] == target); - bld.sopp(aco_opcode::s_branch, target); - break; - case aco_opcode::p_cbranch_nz: - assert(block->linear_succs[1] == target); - if (branch->operands[0].physReg() == exec) - bld.sopp(aco_opcode::s_cbranch_execnz, target); - else if (branch->operands[0].physReg() == vcc) - bld.sopp(aco_opcode::s_cbranch_vccnz, target); - else { - assert(branch->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc1, target); - } - break; - case aco_opcode::p_cbranch_z: - assert(block->linear_succs[1] == target); - if (branch->operands[0].physReg() == exec) - bld.sopp(aco_opcode::s_cbranch_execz, target); - else if (branch->operands[0].physReg() == vcc) - bld.sopp(aco_opcode::s_cbranch_vccz, target); - else { - assert(branch->operands[0].physReg() == scc); - bld.sopp(aco_opcode::s_cbranch_scc0, target); - } - break; - default: unreachable("Unknown Pseudo branch instruction!"); - } - } else if (instr->isReduction()) { Pseudo_reduction_instruction& reduce = instr->reduction(); emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size, diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build index cc4c1e038ee..a4a6066d863 100644 --- a/src/amd/compiler/meson.build +++ b/src/amd/compiler/meson.build @@ -50,6 +50,7 @@ libaco_files = files( 'aco_reduce_assign.cpp', 'aco_register_allocation.cpp', 'aco_live_var_analysis.cpp', + 'aco_lower_branches.cpp', 'aco_lower_phis.cpp', 'aco_lower_subdword.cpp', 'aco_lower_to_cssa.cpp',