/* mesa/src/amd/compiler/aco_lower_branches.cpp */

/*
* Copyright © 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
namespace aco {
namespace {
/* Shared state for the branch-lowering pass. */
struct branch_ctx {
   Program* program;
   /* Per block: whether the exec mask incoming to that block is still needed.
    * Initialized conservatively to true for every block; refined (bottom-up)
    * by eliminate_useless_exec_writes_in_block.
    */
   std::vector<bool> blocks_incoming_exec_used;

   branch_ctx(Program* program_)
       : program(program_), blocks_incoming_exec_used(program_->blocks.size(), true)
   {}
};
/**
 * Disconnect the linear CFG edge from \p block to the block at \p succ_index.
 *
 * Removes \p block from the successor's predecessor list and \p succ_index
 * from \p block's successor list. If the successor thereby becomes
 * unreachable (no remaining linear predecessors), its instructions are
 * dropped and all of its outgoing edges are removed recursively.
 */
void
remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index)
{
   Block& succ = ctx.program->blocks[succ_index];

   /* std::remove shifts the single matching entry to the end; the assert
    * checks that exactly one occurrence of the edge existed.
    */
   ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index);
   assert(std::next(it) == succ.linear_preds.end());
   succ.linear_preds.pop_back();

   it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index);
   assert(std::next(it) == block.linear_succs.end());
   block.linear_succs.pop_back();

   if (succ.linear_preds.empty()) {
      /* This block became unreachable - Recursively remove successors.
       * Drain the successor list instead of range-iterating it: each
       * recursive call erases an entry from succ.linear_succs, which would
       * invalidate the iterators of a range-for over the same vector.
       */
      succ.instructions.clear();
      while (!succ.linear_succs.empty())
         remove_linear_successor(ctx, succ, succ.linear_succs.back());
   }
}
/**
 * Try to bypass a "simple" block: one that is empty or contains only a
 * single s_branch, and has exactly one linear successor (ensured by the
 * caller). Predecessor edges and branch targets are redirected straight to
 * the successor where possible; predecessors that still need to branch here
 * (no fall-through path available) are kept in the block's predecessor list.
 */
void
try_remove_simple_block(branch_ctx& ctx, Block& block)
{
   /* Bail out unless the block is empty or holds nothing but an s_branch. */
   if (!block.instructions.empty() && block.instructions.front()->opcode != aco_opcode::s_branch)
      return;

   /* Don't remove the preheader as it might be needed as convergence point
    * in order to insert code (e.g. for loop alignment, wait states, etc.).
    */
   if (block.kind & block_kind_loop_preheader)
      return;

   unsigned succ_idx = block.linear_succs[0];
   Block& succ = ctx.program->blocks[succ_idx];

   /* Predecessors which must keep branching to this block. */
   Block::edge_vec new_preds;
   for (unsigned pred_idx : block.linear_preds) {
      Block& pred = ctx.program->blocks[pred_idx];
      assert(pred.index < block.index);
      assert(!pred.instructions.empty() && pred.instructions.back()->isBranch());
      Instruction* branch = pred.instructions.back().get();
      if (branch->opcode == aco_opcode::p_branch) {
         /* The predecessor unconditionally jumps to this block. Redirect to successor. */
         pred.linear_succs[0] = succ_idx;
         succ.linear_preds.push_back(pred_idx);
      } else if (pred.linear_succs[0] == succ_idx || pred.linear_succs[1] == succ_idx) {
         /* The predecessor's alternative target is this block's successor:
          * with this block gone both edges coincide, so the conditional
          * branch becomes an unconditional one.
          */
         pred.linear_succs[0] = succ_idx;
         pred.linear_succs[1] = pred.linear_succs.back(); /* In case of discard */
         pred.linear_succs.pop_back();
         branch->opcode = aco_opcode::p_branch;
         branch->branch().never_taken = false;
         branch->branch().rarely_taken = false;
      } else if (pred.linear_succs[1] == block.index) {
         /* The predecessor jumps to this block. Redirect to successor. */
         pred.linear_succs[1] = succ_idx;
         succ.linear_preds.push_back(pred_idx);
      } else {
         /* This block is the fall-through target of the predecessor. */
         assert(pred_idx == block.index - 1);
         if (block.instructions.empty()) {
            /* If this block is empty, just fall-through to the successor. */
            pred.linear_succs[0] = succ_idx;
            succ.linear_preds.push_back(pred_idx);
            continue;
         }

         /* Otherwise, check if there is a fall-through path for the jump target. */
         bool can_fallthrough = block.index < pred.linear_succs[1];
         for (unsigned j = block.index + 1; can_fallthrough && j < pred.linear_succs[1]; j++) {
            if (!ctx.program->blocks[j].instructions.empty())
               can_fallthrough = false;
         }
         if (!can_fallthrough) {
            /* The predecessor still needs this block (and its s_branch). */
            new_preds.push_back(pred_idx);
            continue;
         }

         pred.linear_succs[0] = pred.linear_succs[1];
         pred.linear_succs[1] = succ_idx;
         succ.linear_preds.push_back(pred_idx);

         /* Invert the condition. This branch now falls through to its original target.
          * However, we don't update the fall-through target since this instruction
          * gets lowered in the next step, anyway.
          */
         if (branch->opcode == aco_opcode::p_cbranch_nz)
            branch->opcode = aco_opcode::p_cbranch_z;
         else
            branch->opcode = aco_opcode::p_cbranch_nz;
         branch->branch().never_taken = false;
         branch->branch().rarely_taken = false;
      }

      /* Update the branch target. */
      branch->branch().target[0] = succ_idx;
   }

   /* If this block is part of the logical CFG, also connect pre- and successors. */
   if (!block.logical_succs.empty()) {
      assert(block.logical_succs.size() == 1);
      unsigned logical_succ_idx = block.logical_succs[0];
      Block& logical_succ = ctx.program->blocks[logical_succ_idx];
      ASSERTED auto it = std::remove(logical_succ.logical_preds.begin(),
                                     logical_succ.logical_preds.end(), block.index);
      assert(std::next(it) == logical_succ.logical_preds.end());
      logical_succ.logical_preds.pop_back();
      for (unsigned pred_idx : block.logical_preds) {
         Block& pred = ctx.program->blocks[pred_idx];
         std::replace(pred.logical_succs.begin(), pred.logical_succs.end(), block.index,
                      logical_succ_idx);
         if (pred.logical_succs.size() == 2 && pred.logical_succs[0] == pred.logical_succs[1])
            pred.logical_succs.pop_back(); /* This should have been optimized in NIR! */
         else
            logical_succ.logical_preds.push_back(pred_idx);
      }
      block.logical_succs.clear();
      block.logical_preds.clear();
   }

   /* Keep only the predecessors that still branch here. If none remain, the
    * block is dead: drop its instructions and its remaining successor edge.
    */
   block.linear_preds = new_preds;
   if (block.linear_preds.empty()) {
      remove_linear_successor(ctx, block, succ_idx);
      block.instructions.clear();
   }
}
/* Returns true if any definition or operand of \p instr overlaps the
 * register range [reg, reg + size).
 */
bool
instr_uses_reg(aco_ptr<Instruction>& instr, PhysReg reg, uint32_t size)
{
   /* Two register ranges intersect iff each one starts below the other's end. */
   const auto overlaps_range = [reg, size](PhysReg start, unsigned count) -> bool
   { return start + count > reg && reg + size > start; };

   for (const Definition& def : instr->definitions) {
      if (overlaps_range(def.physReg(), def.size()))
         return true;
   }
   for (const Operand& op : instr->operands) {
      if (overlaps_range(op.physReg(), op.size()))
         return true;
   }
   return false;
}
void
try_merge_break_with_continue(branch_ctx& ctx, Block& block)
{
   /* Look for this:
    * BB1:
    * ...
    * p_branch_z exec BB3, BB2
    * BB2:
    * ...
    * s[0:1], scc = s_andn2 s[0:1], exec
    * s_cbranch_scc0 BB4
    * BB3:
    * exec = s_mov_b64 s[0:1]
    * s_branch BB1
    * BB4:
    * ...
    *
    * And turn it into this:
    * BB1:
    * ...
    * p_branch_z exec BB3, BB2
    * BB2:
    * ...
    * BB3:
    * s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec
    * s_cbranch_scc1 BB1, BB4
    * BB4:
    * ...
    */

   /* The break block must end in a conditional scc0 branch (BB2 above). */
   if (block.linear_succs.size() != 2 || block.instructions.size() < 2)
      return;
   Instruction* branch = block.instructions.back().get();
   if (branch->opcode != aco_opcode::s_cbranch_scc0)
      return;

   Block& merge = ctx.program->blocks[block.linear_succs[0]];
   Block& loopexit = ctx.program->blocks[block.linear_succs[1]];

   /* Just a jump to the loop header. */
   if (merge.linear_succs.size() != 1)
      return;

   /* Every other path into the merge block must branch on exec being zero,
    * since the exec write will be moved into the merge block.
    */
   for (unsigned merge_pred : merge.linear_preds) {
      if (merge_pred == block.index)
         continue;

      Block& pred = ctx.program->blocks[merge_pred];
      Instruction* pred_branch = pred.instructions.back().get();

      /* The branch needs to be exec zero only, otherwise we corrupt exec. */
      if (pred_branch->opcode != aco_opcode::p_cbranch_z ||
          pred_branch->operands[0].physReg() != exec)
         return;
   }

   /* merge block: copy to exec, branch */
   if (merge.instructions.size() != 2 || merge.instructions.back()->opcode != aco_opcode::s_branch)
      return;

   Builder bld(ctx.program);
   Instruction* execwrite = merge.instructions[0].get();
   if (execwrite->opcode != bld.w64or32(Builder::s_mov) || !execwrite->writes_exec())
      return;

   /* break block: find s_andn2 (scanning backwards from the branch) */
   PhysReg exec_temp = execwrite->operands[0].physReg();
   Instruction* execsrc = nullptr;
   for (auto rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
      aco_ptr<Instruction>& instr = *rit;
      if (instr->opcode == bld.w64or32(Builder::s_andn2) &&
          instr->definitions[0].physReg() == exec_temp &&
          instr->operands[0].physReg() == exec_temp && instr->operands[1].physReg() == exec) {
         /* Detach the instruction; std::next(rit).base() is the forward
          * iterator addressing the element that rit refers to.
          */
         execsrc = instr.release();
         block.instructions.erase(std::next(rit).base());
         break;
      }

      /* There might be copies for phis after the execsrc instructions,
       * but these must not read / write the same register.
       */
      if (instr->writes_exec() || instr_uses_reg(instr, exec_temp, bld.lm.size()) ||
          instr_uses_reg(instr, scc, s1))
         break;
   }
   if (execsrc == nullptr)
      return;

   /* Use conditional branch in merge block. */
   merge.instructions.back()->opcode = aco_opcode::s_cbranch_scc1;

   /* Rewire the CFG: block now falls through to merge only; merge gains the
    * loopexit as an additional successor and takes over block's exit edge.
    */
   block.linear_succs.pop_back();
   block.linear_succs[0] = merge.index;
   merge.linear_succs.push_back(loopexit.index);
   std::swap(merge.linear_succs[0], merge.linear_succs[1]);
   std::replace(loopexit.linear_preds.begin(), loopexit.linear_preds.end(), block.index,
                merge.index);

   /* Check if we can use the loopexit as the fallthrough block.
    * Otherwise, we'll need an extra branch instruction.
    */
   for (unsigned i = merge.index + 1; i < loopexit.index; i++) {
      if (!ctx.program->blocks[i].instructions.empty()) {
         /* Reuse the old break branch as an unconditional jump to loopexit. */
         branch->opcode = aco_opcode::s_branch;
         merge.instructions.emplace_back(std::move(block.instructions.back()));
         break;
      }
   }
   block.instructions.pop_back();

   if (ctx.program->gfx_level >= GFX9) {
      /* Combine s_andn2 and copy to exec to s_andn2_wrexec. */
      Instruction* wr_exec =
         bld.sop1(Builder::s_andn2_wrexec, execsrc->definitions[0], execsrc->definitions[1],
                  Definition(exec, bld.lm), execsrc->operands[0], execsrc->operands[1]);
      merge.instructions[0].reset(wr_exec);
   } else {
      /* Move s_andn2 to the merge block. */
      merge.instructions.emplace(merge.instructions.begin(), execsrc);
   }

   /* The merge block now consumes its incoming exec mask. */
   ctx.blocks_incoming_exec_used[merge.index] = true;
}
/**
 * Remove exec writes whose value is provably never consumed.
 *
 * Walks the block backwards, tracking whether the current exec value is still
 * needed — by a later instruction in this block, by a branch target's
 * incoming-exec requirement, or by the fall-through successor. The result for
 * this block is memoized in ctx.blocks_incoming_exec_used so that its
 * predecessors (processed later in the bottom-up walk) can use it.
 */
void
eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block)
{
   /* Does anything after the end of this block need the exec mask? */
   bool exec_write_used = false;
   if (block.kind & block_kind_end_with_regs) {
      /* Last block of a program with succeed shader part should respect final exec write. */
      exec_write_used = true;
   } else if (!block.linear_succs.empty()) {
      /* Check if the successor needs the outgoing exec mask from the current block. */
      exec_write_used = ctx.blocks_incoming_exec_used[block.linear_succs[0]];
   }

   /* Go through all instructions and eliminate useless exec writes. */
   for (int i = block.instructions.size() - 1; i >= 0; --i) {
      aco_ptr<Instruction>& instr = block.instructions[i];

      /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
      if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz) {
         /* The branch target's incoming-exec requirement also applies here. */
         exec_write_used |= ctx.blocks_incoming_exec_used[instr->salu().imm];
      }

      /* See if the current instruction needs or writes exec. */
      bool needs_exec = needs_exec_mask(instr.get());
      bool writes_exec =
         instr->writes_exec() && instr->definitions[0].regClass() == ctx.program->lane_mask;

      /* See if we found an unused exec write. */
      if (writes_exec && !exec_write_used) {
         /* Don't eliminate an instruction that writes registers other than exec and scc.
          * It is possible that this is eg. an s_and_saveexec and the saved value is
          * used by a later branch.
          */
         bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(),
                                         [](const Definition& def) -> bool
                                         { return def.physReg() != exec && def.physReg() != scc; });
         if (!writes_other) {
            instr.reset();
            continue;
         }
      }

      /* For a newly encountered exec write, clear the used flag. */
      if (writes_exec)
         exec_write_used = false;

      /* If the current instruction needs exec, mark it as used. */
      exec_write_used |= needs_exec;
   }

   /* Remember if the current block needs an incoming exec mask from its predecessors. */
   ctx.blocks_incoming_exec_used[block.index] = exec_write_used;

   /* Cleanup: remove deleted instructions from the vector. */
   auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
   block.instructions.resize(new_end - block.instructions.begin());
}
/**
 * Check if the branch instruction can be removed:
 * This is beneficial when executing the next block with an empty exec mask
 * is faster than the branch instruction itself.
 *
 * Override this judgement when:
 * - The application prefers to remove control flow
 * - The compiler stack knows that it's a divergent branch never taken
 */
bool
can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
{
   const uint32_t target = branch->target[0];

   /* "Uniform" here means: not a p_cbranch_z/nz conditional on exec. */
   const bool uniform_branch =
      !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) &&
        branch->operands[0].physReg() == exec);

   if (branch->never_taken) {
      /* For uniform branches, never_taken implies the skipped region is empty. */
      assert(!uniform_branch || std::all_of(std::next(ctx.program->blocks.begin(), block.index + 1),
                                            std::next(ctx.program->blocks.begin(), target),
                                            [](Block& b) { return b.instructions.empty(); }));
      return true;
   }

   /* Cannot remove back-edges. */
   if (block.index >= target)
      return false;

   const bool prefer_remove = branch->rarely_taken;
   unsigned num_scalar = 0;
   unsigned num_vector = 0;

   /* Check the instructions between branch and target */
   for (unsigned i = block.index + 1; i < target; i++) {
      /* Uniform conditional branches must not be ignored if they
       * are about to jump over actual instructions */
      if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
         return false;
      /* Don't enter loops with empty exec mask. */
      if (ctx.program->blocks[i].loop_nest_depth > block.loop_nest_depth)
         return false;

      for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
         if (instr->isSOPP()) {
            /* Discard early exits and loop breaks and continues should work fine with
             * an empty exec mask.
             */
            if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
                instr->opcode == aco_opcode::s_cbranch_scc1 ||
                instr->opcode == aco_opcode::s_cbranch_execz ||
                instr->opcode == aco_opcode::s_cbranch_execnz) {
               bool is_break = ctx.program->blocks[i].kind & block_kind_break;
               bool discard_early_exit =
                  ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
               if (is_break || discard_early_exit) {
                  /* If the branch target is the same, we can be sure that it will be taken. */
                  if (instr->salu().imm == target)
                     return true;
                  continue;
               }
            }
            /* Any other SOPP blocks removal. */
            return false;
         } else if (instr->isSALU()) {
            num_scalar++;
         } else if (instr->isVALU() || instr->isVINTRP()) {
            if (instr->opcode == aco_opcode::v_writelane_b32 ||
                instr->opcode == aco_opcode::v_writelane_b32_e64) {
               /* writelane ignores exec, writing inactive lanes results in UB. */
               return false;
            }
            num_vector++;
            /* VALU which writes SGPRs are always executed on GFX10+ */
            if (ctx.program->gfx_level >= GFX10) {
               for (Definition& def : instr->definitions) {
                  if (def.regClass().type() == RegType::sgpr)
                     num_scalar++;
               }
            }
         } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
            /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
             * SMEM might be an invalid access, and barriers are probably expensive. */
            return false;
         } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
            // TODO: GFX6-9 can use vskip
            if (!prefer_remove)
               return false;
         } else if (instr->opcode != aco_opcode::p_debug_info) {
            assert(false && "Pseudo instructions should be lowered by this point.");
            return false;
         }

         if (!prefer_remove) {
            /* Under these conditions, we shouldn't remove the branch.
             * Don't care about the estimated cycles when the shader prefers flattening.
             */
            unsigned est_cycles;
            if (ctx.program->gfx_level >= GFX10)
               est_cycles = num_scalar * 2 + num_vector;
            else
               est_cycles = num_scalar * 4 + num_vector * 4;

            if (est_cycles > 16)
               return false;
         }
      }
   }

   return true;
}
/**
 * Lower the trailing pseudo branch of \p block to a hardware SOPP branch,
 * or drop it entirely when executing the skipped region with an empty exec
 * mask is cheaper than branching (see can_remove_branch).
 *
 * NOTE(review): the original scraped source had a stray git commit message
 * pasted between the last `case` and `default:` of the switch below, which
 * made the file uncompilable; that text has been removed.
 */
void
lower_branch_instruction(branch_ctx& ctx, Block& block)
{
   if (block.instructions.empty() || !block.instructions.back()->isBranch())
      return;

   aco_ptr<Instruction> branch = std::move(block.instructions.back());
   const uint32_t target = branch->branch().target[0];
   block.instructions.pop_back();

   if (can_remove_branch(ctx, block, &branch->branch())) {
      /* Removed conditional branches also lose the edge to their jump
       * target; an unconditional branch keeps its single successor.
       */
      if (branch->opcode != aco_opcode::p_branch)
         remove_linear_successor(ctx, block, target);
      return;
   }

   /* emit branch instruction */
   Builder bld(ctx.program, &block.instructions);
   switch (branch->opcode) {
   case aco_opcode::p_branch:
      assert(block.linear_succs[0] == target);
      bld.sopp(aco_opcode::s_branch, target);
      break;
   case aco_opcode::p_cbranch_nz:
      assert(block.linear_succs[1] == target);
      if (branch->operands[0].physReg() == exec)
         bld.sopp(aco_opcode::s_cbranch_execnz, target);
      else if (branch->operands[0].physReg() == vcc)
         bld.sopp(aco_opcode::s_cbranch_vccnz, target);
      else {
         assert(branch->operands[0].physReg() == scc);
         bld.sopp(aco_opcode::s_cbranch_scc1, target);
      }
      break;
   case aco_opcode::p_cbranch_z:
      assert(block.linear_succs[1] == target);
      if (branch->operands[0].physReg() == exec)
         bld.sopp(aco_opcode::s_cbranch_execz, target);
      else if (branch->operands[0].physReg() == vcc)
         bld.sopp(aco_opcode::s_cbranch_vccz, target);
      else {
         assert(branch->operands[0].physReg() == scc);
         bld.sopp(aco_opcode::s_cbranch_scc0, target);
      }
      break;
   default: UNREACHABLE("Unknown Pseudo branch instruction!");
   }
}
/**
 * Try to merge a purely linear block (not part of the logical CFG) into its
 * single predecessor or single successor, emptying the block entirely.
 */
void
try_stitch_linear_block(branch_ctx& ctx, Block& block)
{
   /* Don't stitch blocks that are part of the logical CFG. */
   if (block.linear_preds.empty() || block.linear_succs.empty() || !block.logical_preds.empty())
      return;

   /* Try to stitch this block with the predecessor:
    * This block must have exactly one predecessor and
    * the predecessor must have exactly one successor.
    */
   Block& pred = ctx.program->blocks[block.linear_preds[0]];
   if (block.linear_preds.size() == 1 && pred.linear_succs.size() == 1 &&
       (pred.instructions.empty() || !pred.instructions.back()->isSOPP())) {
      /* Insert the instructions at the end of the predecessor and fixup edges. */
      pred.instructions.insert(pred.instructions.end(),
                               std::move_iterator(block.instructions.begin()),
                               std::move_iterator(block.instructions.end()));
      for (unsigned succ_idx : block.linear_succs) {
         Block& s = ctx.program->blocks[succ_idx];
         std::replace(s.linear_preds.begin(), s.linear_preds.end(), block.index, pred.index);
      }
      pred.linear_succs = std::move(block.linear_succs);
      /* Leave the block empty and disconnected. */
      block.instructions.clear();
      block.linear_preds.clear();
      block.linear_succs.clear();
      return;
   }

   /* Try to stitch this block with the successor:
    * This block must have exactly one successor and
    * the successor must have exactly one predecessor.
    */
   Block& succ = ctx.program->blocks[block.linear_succs[0]];
   if (block.linear_succs.size() == 1 && succ.linear_preds.size() == 1 &&
       (block.instructions.empty() || !block.instructions.back()->isSOPP())) {
      /* Insert the instructions at the beginning of the successor. */
      succ.instructions.insert(succ.instructions.begin(),
                               std::move_iterator(block.instructions.begin()),
                               std::move_iterator(block.instructions.end()));
      for (unsigned pred_idx : block.linear_preds) {
         Block& p = ctx.program->blocks[pred_idx];
         /* Retarget any predecessor branch that jumped to this block. */
         if (!p.instructions.empty() &&
             instr_info.classes[(int)p.instructions.back()->opcode] == instr_class::branch &&
             p.instructions.back()->salu().imm == block.index) {
            p.instructions.back()->salu().imm = succ.index;
         }
         std::replace(p.linear_succs.begin(), p.linear_succs.end(), block.index, succ.index);
      }
      succ.linear_preds = std::move(block.linear_preds);
      /* Leave the block empty and disconnected. */
      block.instructions.clear();
      block.linear_preds.clear();
      block.linear_succs.clear();
   }
}
/**
 * Try to rotate the loop so that a branch-only predecessor of the loop
 * header becomes the new loop latch: its trailing s_branch (the back-edge)
 * is removed and an s_branch into the loop is inserted at the preheader.
 *
 * NOTE(review): this relies on block_kind_loop_latch being set on \p header
 * at this point — confirm where that flag is assigned in earlier passes.
 */
void
try_rotate_latch_block(branch_ctx& ctx, Block& header)
{
   /* For now, only allow exactly one predecessor block from the loop header
    * to become the new loop latch.
    */
   if (!(header.kind & block_kind_loop_latch))
      return;

   /* After jump-threading, the loop header might have more than 2 predecessors. */
   assert(header.linear_preds.size() >= 2);
   /* Candidate latch: the last predecessor; it must end in an s_branch. */
   Block& block = ctx.program->blocks[header.linear_preds.back()];
   if (block.instructions.empty() || block.instructions.back()->opcode != aco_opcode::s_branch)
      return;

   /* Check for all predecessors, if they could actually jump back to the loop header. */
   for (unsigned pred_idx : block.linear_preds) {
      Block& pred = ctx.program->blocks[pred_idx];
      assert(pred.index < block.index);
      if (pred.instructions.empty() || !pred.instructions.back()->isSOPP())
         return;

      SALU_instruction* branch = &pred.instructions.back()->salu();
      aco_opcode invert = aco_opcode::num_opcodes;
      switch (branch->opcode) {
      case aco_opcode::s_branch: continue;
      case aco_opcode::s_cbranch_execz: invert = aco_opcode::s_cbranch_execnz; break;
      case aco_opcode::s_cbranch_execnz: invert = aco_opcode::s_cbranch_execz; break;
      case aco_opcode::s_cbranch_vccz: invert = aco_opcode::s_cbranch_vccnz; break;
      case aco_opcode::s_cbranch_vccnz: invert = aco_opcode::s_cbranch_vccz; break;
      case aco_opcode::s_cbranch_scc0: invert = aco_opcode::s_cbranch_scc1; break;
      case aco_opcode::s_cbranch_scc1: invert = aco_opcode::s_cbranch_scc0; break;
      default: return;
      }

      assert(pred.linear_succs.size() >= 2);
      if (pred.linear_succs[1] == block.index) {
         /* The predecessor already jumps to the candidate latch block. */
         assert(branch->imm == block.index);
         continue;
      } else if (pred.linear_succs[0] == block.index) {
         /* Check if there is a fall-through path for the jump target. */
         if (block.index > pred.linear_succs[1] ||
             (ctx.program->blocks[pred.linear_succs[1]].kind & block_kind_loop_latch))
            return;
         for (unsigned j = block.index + 1; j < pred.linear_succs[1]; j++) {
            if (!ctx.program->blocks[j].instructions.empty())
               return;
         }

         /* There can be at most one branch which falls through,
          * so just update it directly.
          */
         pred.linear_succs[0] = pred.linear_succs[1];
         pred.linear_succs[1] = block.index;
         branch->opcode = invert;
         branch->imm = block.index;
      }
   }

   /* Move the latch role onto the candidate block and drop its trailing
    * s_branch (the back-edge).
    */
   header.kind &= ~block_kind_loop_latch;
   block.kind |= block_kind_loop_latch;
   block.instructions.pop_back();
   assert(!block.instructions.empty());

   /* Insert a new branch at the loop preheader: */
   Builder(ctx.program, &ctx.program->blocks[header.index - 1])
      .sopp(aco_opcode::s_branch, header.index);
   return;
}
} /* end namespace */
/* Pass entry point: lower all pseudo branches and optimize the linear CFG. */
void
lower_branches(Program* program)
{
   branch_ctx ctx(program);

   /* Bottom-up pass: lower branch instructions, drop dead exec writes,
    * merge break/continue blocks and bypass trivial blocks.
    */
   for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
      Block& block = *it;

      lower_branch_instruction(ctx, block);
      eliminate_useless_exec_writes_in_block(ctx, block);

      if (block.kind & block_kind_break)
         try_merge_break_with_continue(ctx, block);

      if (block.linear_succs.size() == 1 && block.logical_succs.size() <= 1)
         try_remove_simple_block(ctx, block);
   }

   /* Second bottom-up pass: rotate loop latches and stitch linear blocks. */
   for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); ++it) {
      Block& block = *it;
      if (block.kind & block_kind_loop_header)
         try_rotate_latch_block(ctx, block);
      else
         try_stitch_linear_block(ctx, block);
   }
}
} // namespace aco