mesa/src/amd/compiler/aco_lower_branches.cpp
Daniel Schürmann 26a3038b65 aco/lower_branches: remove edges between blocks if there is no direct branch
This way, linear predecessors and successors better reflect the
actual control flow which improves wait state insertion and hazard
mitigation.

Totals from 10252 (12.91% of 79395) affected shaders: (Navi31)

Instrs: 18824540 -> 18803823 (-0.11%); split: -0.11%, +0.00%
CodeSize: 99025464 -> 98942028 (-0.08%); split: -0.08%, +0.00%
Latency: 169291854 -> 165781877 (-2.07%); split: -2.07%, +0.00%
InvThroughput: 29701086 -> 29228602 (-1.59%); split: -1.59%, +0.00%
SClause: 510587 -> 510586 (-0.00%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32389>
2024-12-12 08:46:22 +00:00

200 lines
6.8 KiB
C++

/*
* Copyright © 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
namespace aco {
namespace {
struct branch_ctx {
Program* program;
branch_ctx(Program* program_) : program(program_) {}
};
/**
 * Remove the linear CFG edge from `block` to the block with index `succ_index`.
 *
 * The edge is erased on both sides: `block.index` is dropped from the successor's
 * linear_preds and `succ_index` is dropped from `block.linear_succs`. Each side is
 * expected to hold the entry exactly once (checked by the asserts).
 *
 * If the successor ends up without any linear predecessor, its instructions are
 * cleared and all of its own outgoing linear edges are removed recursively.
 */
void
remove_linear_successor(branch_ctx& ctx, Block& block, uint32_t succ_index)
{
   Block& succ = ctx.program->blocks[succ_index];

   /* std::remove() compacts the kept entries to the front; the single leftover
    * slot at the end is then dropped with pop_back(). */
   ASSERTED auto it = std::remove(succ.linear_preds.begin(), succ.linear_preds.end(), block.index);
   assert(std::next(it) == succ.linear_preds.end());
   succ.linear_preds.pop_back();

   it = std::remove(block.linear_succs.begin(), block.linear_succs.end(), succ_index);
   assert(std::next(it) == block.linear_succs.end());
   block.linear_succs.pop_back();

   if (succ.linear_preds.empty()) {
      /* This block became unreachable - Recursively remove successors.
       *
       * Every recursive call shrinks succ.linear_succs by one entry, so the container
       * must not be traversed with a range-for here: the loop would keep using a stale
       * end iterator and read an entry that was already popped. Drain it from the
       * front instead, which removes the edges in their stored order.
       */
      succ.instructions.clear();
      while (!succ.linear_succs.empty())
         remove_linear_successor(ctx, succ, succ.linear_succs[0]);
   }
}
/**
 * Decide whether the branch at the end of `block` can be dropped, so that
 * execution falls through the blocks located between `block` and the branch
 * target instead of jumping over them.
 *
 * Dropping the branch is worthwhile when running the skipped blocks with an
 * empty exec mask is cheaper than the branch instruction itself.
 *
 * The cost estimate is overridden when:
 * - the branch is marked rarely_taken (removing control flow is preferred)
 * - the branch is marked never_taken (a divergent branch known to be never taken)
 *
 * Returns true if the branch may be removed.
 */
bool
can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
{
   const uint32_t target = branch->target[0];

   /* Only conditional branches on exec are divergent. The operand is looked at
    * solely for conditional opcodes. */
   const bool is_conditional = branch->opcode == aco_opcode::p_cbranch_z ||
                               branch->opcode == aco_opcode::p_cbranch_nz;
   const bool uniform_branch = !(is_conditional && branch->operands[0].physReg() == exec);

   if (branch->never_taken) {
      assert(!uniform_branch);
      return true;
   }

   /* Cannot remove back-edges. */
   if (target <= block.index)
      return false;

   const bool skip_cost_estimate = branch->rarely_taken;
   const bool gfx10_plus = ctx.program->gfx_level >= GFX10;
   unsigned scalar_count = 0;
   unsigned vector_count = 0;

   /* Inspect every block that the branch would have jumped over. */
   for (unsigned idx = block.index + 1; idx < target; idx++) {
      Block& skipped = ctx.program->blocks[idx];

      /* A uniform branch must stay if it skips over any actual instruction. */
      if (uniform_branch && !skipped.instructions.empty())
         return false;

      for (aco_ptr<Instruction>& instr : skipped.instructions) {
         if (instr->isSOPP()) {
            /* The only SOPP instructions tolerated are SCC branches that implement
             * discard early exits or loop breaks/continues: those should work fine
             * with an empty exec mask.
             */
            const bool is_scc_branch = instr->opcode == aco_opcode::s_cbranch_scc0 ||
                                       instr->opcode == aco_opcode::s_cbranch_scc1;
            if (!is_scc_branch)
               return false;

            const bool is_break_continue =
               skipped.kind & (block_kind_break | block_kind_continue);
            const bool discard_early_exit =
               ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
            if (!is_break_continue && !discard_early_exit)
               return false;

            /* Tolerated branches do not take part in the cost estimate. */
            continue;
         }

         if (instr->isSALU()) {
            scalar_count++;
         } else if (instr->isVALU() || instr->isVINTRP()) {
            /* writelane ignores exec, writing inactive lanes results in UB. */
            const bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
                                      instr->opcode == aco_opcode::v_writelane_b32_e64;
            if (is_writelane)
               return false;

            vector_count++;

            /* VALU which writes SGPRs are always executed on GFX10+ */
            if (gfx10_plus) {
               for (Definition& def : instr->definitions) {
                  if (def.regClass().type() == RegType::sgpr)
                     scalar_count++;
               }
            }
         } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
            /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
             * SMEM might be an invalid access, and barriers are probably expensive. */
            return false;
         } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
            // TODO: GFX6-9 can use vskip
            if (!skip_cost_estimate)
               return false;
         } else if (instr->opcode != aco_opcode::p_debug_info) {
            assert(false && "Pseudo instructions should be lowered by this point.");
            return false;
         }

         /* Don't care about the estimated cycles when the shader prefers flattening. */
         if (skip_cost_estimate)
            continue;

         /* Keep the branch once the skipped code is estimated to be too expensive. */
         const unsigned est_cycles = gfx10_plus ? scalar_count * 2 + vector_count
                                                : scalar_count * 4 + vector_count * 4;
         if (est_cycles > 16)
            return false;
      }
   }

   return true;
}
/**
 * Lower the pseudo branch that terminates `block`, if the block ends in one.
 *
 * The pseudo branch is taken out of the block. If can_remove_branch() approves,
 * nothing is emitted in its place, and for conditional branches the linear edge
 * to the branch target is removed as well. Otherwise the matching SOPP branch
 * instruction is appended to the block.
 */
void
lower_branch_instruction(branch_ctx& ctx, Block& block)
{
   const bool ends_in_branch =
      !block.instructions.empty() && block.instructions.back()->isBranch();
   if (!ends_in_branch)
      return;

   /* Take the pseudo branch out of the instruction list. */
   aco_ptr<Instruction> branch = std::move(block.instructions.back());
   block.instructions.pop_back();
   const uint32_t target = branch->branch().target[0];

   if (can_remove_branch(ctx, block, &branch->branch())) {
      /* For p_branch the edge to the target is left in place. */
      if (branch->opcode != aco_opcode::p_branch)
         remove_linear_successor(ctx, block, target);
      return;
   }

   /* Pick the hardware opcode that matches the register holding the condition. */
   const auto select_by_condition = [&](aco_opcode on_exec, aco_opcode on_vcc,
                                        aco_opcode on_scc) {
      const auto cond = branch->operands[0].physReg();
      if (cond == exec)
         return on_exec;
      if (cond == vcc)
         return on_vcc;
      assert(cond == scc);
      return on_scc;
   };

   /* emit branch instruction */
   Builder bld(ctx.program, &block.instructions);
   aco_opcode hw_opcode;
   switch (branch->opcode) {
   case aco_opcode::p_branch:
      assert(block.linear_succs[0] == target);
      hw_opcode = aco_opcode::s_branch;
      break;
   case aco_opcode::p_cbranch_nz:
      assert(block.linear_succs[1] == target);
      hw_opcode = select_by_condition(aco_opcode::s_cbranch_execnz, aco_opcode::s_cbranch_vccnz,
                                      aco_opcode::s_cbranch_scc1);
      break;
   case aco_opcode::p_cbranch_z:
      assert(block.linear_succs[1] == target);
      hw_opcode = select_by_condition(aco_opcode::s_cbranch_execz, aco_opcode::s_cbranch_vccz,
                                      aco_opcode::s_cbranch_scc0);
      break;
   default: unreachable("Unknown Pseudo branch instruction!");
   }
   bld.sopp(hw_opcode, target);
}
} /* end namespace */
/**
 * Entry point of the pass: lower the terminating pseudo branch of every block
 * in `program`, visiting the blocks from the last one down to the first.
 */
void
lower_branches(Program* program)
{
   branch_ctx ctx(program);

   /* NOTE(review): the reverse order presumably ensures that the higher-indexed blocks
    * inspected by can_remove_branch() have already been lowered - confirm. */
   for (auto remaining = program->blocks.size(); remaining > 0; remaining--)
      lower_branch_instruction(ctx, program->blocks[remaining - 1]);
}
} // namespace aco