Mirror of https://gitlab.freedesktop.org/mesa/mesa.git (synced 2025-12-28 16:50:10 +01:00)
aco: move branch lowering optimization into separate file 'aco_lower_branches.cpp'
No fossil changes.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32389>
parent 845660f2b7
commit 22ffe72022

5 changed files with 182 additions and 134 deletions
src/amd/compiler/aco_interface.cpp

@@ -178,6 +178,7 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
    /* Lower to HW Instructions */
    lower_to_hw_instr(program.get());
+   lower_branches(program.get());
    validate(program.get());

    if (!options->optimisations_disabled && !(debug_flags & DEBUG_NO_SCHED_VOPD))

@@ -422,6 +423,7 @@ aco_compile_trap_handler(const struct aco_compiler_options* options,
    validate(program.get());

    lower_to_hw_instr(program.get());
+   lower_branches(program.get());
    validate(program.get());

    insert_waitcnt(program.get());
src/amd/compiler/aco_ir.h

@@ -2252,6 +2252,7 @@ void insert_exec_mask(Program* program);
 void value_numbering(Program* program);
 void optimize(Program* program);
 void optimize_postRA(Program* program);
+void lower_branches(Program* program);
 void setup_reduce_temp(Program* program);
 void lower_to_cssa(Program* program);
 void register_allocation(Program* program, ra_test_policy = {});
src/amd/compiler/aco_lower_branches.cpp (new file, 178 lines)

@@ -0,0 +1,178 @@
/*
 * Copyright © 2024 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_builder.h"
#include "aco_ir.h"

namespace aco {
namespace {

struct branch_ctx {
   Program* program;

   branch_ctx(Program* program_) : program(program_) {}
};

/**
 * Check if the branch instruction can be removed:
 * This is beneficial when executing the next block with an empty exec mask
 * is faster than the branch instruction itself.
 *
 * Override this judgement when:
 * - The application prefers to remove control flow
 * - The compiler stack knows that it's a divergent branch never taken
 */
bool
can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* branch)
{
   const uint32_t target = branch->target[0];
   const bool uniform_branch =
      !((branch->opcode == aco_opcode::p_cbranch_z || branch->opcode == aco_opcode::p_cbranch_nz) &&
        branch->operands[0].physReg() == exec);

   if (branch->never_taken) {
      assert(!uniform_branch);
      return true;
   }

   /* Cannot remove back-edges. */
   if (block.index >= target)
      return false;

   const bool prefer_remove = branch->rarely_taken;
   unsigned num_scalar = 0;
   unsigned num_vector = 0;

   /* Check the instructions between branch and target */
   for (unsigned i = block.index + 1; i < target; i++) {
      /* Uniform conditional branches must not be ignored if they
       * are about to jump over actual instructions */
      if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
         return false;

      for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
         if (instr->isSOPP()) {
            /* Discard early exits and loop breaks and continues should work fine with
             * an empty exec mask.
             */
            if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
                instr->opcode == aco_opcode::s_cbranch_scc1) {
               bool is_break_continue =
                  ctx.program->blocks[i].kind & (block_kind_break | block_kind_continue);
               bool discard_early_exit =
                  ctx.program->blocks[instr->salu().imm].kind & block_kind_discard_early_exit;
               if (is_break_continue || discard_early_exit)
                  continue;
            }
            return false;
         } else if (instr->isSALU()) {
            num_scalar++;
         } else if (instr->isVALU() || instr->isVINTRP()) {
            if (instr->opcode == aco_opcode::v_writelane_b32 ||
                instr->opcode == aco_opcode::v_writelane_b32_e64) {
               /* writelane ignores exec, writing inactive lanes results in UB. */
               return false;
            }
            num_vector++;
            /* VALU which writes SGPRs are always executed on GFX10+ */
            if (ctx.program->gfx_level >= GFX10) {
               for (Definition& def : instr->definitions) {
                  if (def.regClass().type() == RegType::sgpr)
                     num_scalar++;
               }
            }
         } else if (instr->isEXP() || instr->isSMEM() || instr->isBarrier()) {
            /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
             * SMEM might be an invalid access, and barriers are probably expensive. */
            return false;
         } else if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isLDSDIR()) {
            // TODO: GFX6-9 can use vskip
            if (!prefer_remove)
               return false;
         } else if (instr->opcode != aco_opcode::p_debug_info) {
            assert(false && "Pseudo instructions should be lowered by this point.");
            return false;
         }

         if (!prefer_remove) {
            /* Under these conditions, we shouldn't remove the branch.
             * Don't care about the estimated cycles when the shader prefers flattening.
             */
            unsigned est_cycles;
            if (ctx.program->gfx_level >= GFX10)
               est_cycles = num_scalar * 2 + num_vector;
            else
               est_cycles = num_scalar * 4 + num_vector * 4;

            if (est_cycles > 16)
               return false;
         }
      }
   }

   return true;
}

void
lower_branch_instruction(branch_ctx& ctx, Block& block)
{
   if (block.instructions.empty() || !block.instructions.back()->isBranch())
      return;

   aco_ptr<Instruction> branch = std::move(block.instructions.back());
   const uint32_t target = branch->branch().target[0];
   block.instructions.pop_back();

   if (can_remove_branch(ctx, block, &branch->branch()))
      return;

   /* emit branch instruction */
   Builder bld(ctx.program, &block.instructions);
   switch (branch->opcode) {
   case aco_opcode::p_branch:
      assert(block.linear_succs[0] == target);
      bld.sopp(aco_opcode::s_branch, target);
      break;
   case aco_opcode::p_cbranch_nz:
      assert(block.linear_succs[1] == target);
      if (branch->operands[0].physReg() == exec)
         bld.sopp(aco_opcode::s_cbranch_execnz, target);
      else if (branch->operands[0].physReg() == vcc)
         bld.sopp(aco_opcode::s_cbranch_vccnz, target);
      else {
         assert(branch->operands[0].physReg() == scc);
         bld.sopp(aco_opcode::s_cbranch_scc1, target);
      }
      break;
   case aco_opcode::p_cbranch_z:
      assert(block.linear_succs[1] == target);
      if (branch->operands[0].physReg() == exec)
         bld.sopp(aco_opcode::s_cbranch_execz, target);
      else if (branch->operands[0].physReg() == vcc)
         bld.sopp(aco_opcode::s_cbranch_vccz, target);
      else {
         assert(branch->operands[0].physReg() == scc);
         bld.sopp(aco_opcode::s_cbranch_scc0, target);
      }
      break;
   default: unreachable("Unknown Pseudo branch instruction!");
   }
}

} /* end namespace */

void
lower_branches(Program* program)
{
   branch_ctx ctx(program);

   for (int i = program->blocks.size() - 1; i >= 0; i--) {
      Block& block = program->blocks[i];
      lower_branch_instruction(ctx, block);
   }
}

} // namespace aco
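As a rough illustration of the cost model above (a standalone sketch with hypothetical numbers, not part of this commit): can_remove_branch() only drops a branch when the instructions it would jump over are estimated at no more than 16 cycles, weighting SALU at 2 and VALU at 1 cycle on GFX10+, and both at 4 cycles on older generations.

#include <cstdio>

/* Sketch of the cycle estimate used by can_remove_branch(); the real pass
 * counts num_scalar/num_vector while walking the blocks between the branch
 * and its target, and rarely_taken branches bypass this threshold entirely. */
static unsigned
estimated_cycles(bool gfx10_plus, unsigned num_scalar, unsigned num_vector)
{
   return gfx10_plus ? num_scalar * 2 + num_vector      /* SALU ~2, VALU ~1 */
                     : num_scalar * 4 + num_vector * 4; /* older GPUs: ~4 each */
}

int
main()
{
   /* 3 SALU + 8 VALU on GFX10+: 3 * 2 + 8 = 14 <= 16, so the branch is removed
    * and the skipped block simply runs with an empty exec mask. */
   std::printf("GFX10+: %u cycles\n", estimated_cycles(true, 3, 8));
   /* The same block on GFX9: 3 * 4 + 8 * 4 = 44 > 16, so the branch is kept. */
   std::printf("GFX9:   %u cycles\n", estimated_cycles(false, 3, 8));
   return 0;
}

Export, SMEM, and barrier instructions veto removal outright regardless of this estimate, and VMEM/DS accesses are only tolerated when the branch is marked rarely_taken.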
src/amd/compiler/aco_lower_to_hw_instr.cpp

@@ -2843,140 +2843,6 @@ lower_to_hw_instr(Program* program)
             }
             default: break;
             }
-         } else if (instr->isBranch()) {
-            Pseudo_branch_instruction* branch = &instr->branch();
-            const uint32_t target = branch->target[0];
-            const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z ||
-                                           branch->opcode == aco_opcode::p_cbranch_nz) &&
-                                          branch->operands[0].physReg() == exec);
-
-            if (branch->never_taken) {
-               assert(!uniform_branch);
-               continue;
-            }
-
-            /* Check if the branch instruction can be removed.
-             * This is beneficial when executing the next block with an empty exec mask
-             * is faster than the branch instruction itself.
-             *
-             * Override this judgement when:
-             * - The application prefers to remove control flow
-             * - The compiler stack knows that it's a divergent branch always taken
-             */
-            const bool prefer_remove = branch->rarely_taken;
-            bool can_remove = block->index < target;
-            unsigned num_scalar = 0;
-            unsigned num_vector = 0;
-
-            /* Check the instructions between branch and target */
-            for (unsigned i = block->index + 1; i < branch->target[0]; i++) {
-               /* Uniform conditional branches must not be ignored if they
-                * are about to jump over actual instructions */
-               if (uniform_branch && !program->blocks[i].instructions.empty())
-                  can_remove = false;
-
-               if (!can_remove)
-                  break;
-
-               for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
-                  if (inst->isSOPP()) {
-                     if (instr_info.classes[(int)inst->opcode] == instr_class::branch) {
-                        /* Discard early exits and loop breaks and continues should work fine with
-                         * an empty exec mask.
-                         */
-                        bool is_break_continue =
-                           program->blocks[i].kind & (block_kind_break | block_kind_continue);
-                        bool discard_early_exit =
-                           program->blocks[inst->salu().imm].kind & block_kind_discard_early_exit;
-                        if ((inst->opcode != aco_opcode::s_cbranch_scc0 &&
-                             inst->opcode != aco_opcode::s_cbranch_scc1) ||
-                            (!discard_early_exit && !is_break_continue))
-                           can_remove = false;
-                     } else {
-                        can_remove = false;
-                     }
-                  } else if (inst->isSALU()) {
-                     num_scalar++;
-                  } else if (inst->isVALU() || inst->isVINTRP()) {
-                     if (inst->opcode == aco_opcode::v_writelane_b32 ||
-                         inst->opcode == aco_opcode::v_writelane_b32_e64) {
-                        /* writelane ignores exec, writing inactive lanes results in UB. */
-                        can_remove = false;
-                     }
-                     num_vector++;
-                     /* VALU which writes SGPRs are always executed on GFX10+ */
-                     if (ctx.program->gfx_level >= GFX10) {
-                        for (Definition& def : inst->definitions) {
-                           if (def.regClass().type() == RegType::sgpr)
-                              num_scalar++;
-                        }
-                     }
-                  } else if (inst->isEXP() || inst->isSMEM() || inst->isBarrier()) {
-                     /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs),
-                      * SMEM might be an invalid access, and barriers are probably expensive. */
-                     can_remove = false;
-                  } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
-                             inst->isLDSDIR()) {
-                     // TODO: GFX6-9 can use vskip
-                     can_remove = prefer_remove;
-                  } else if (inst->opcode != aco_opcode::p_debug_info) {
-                     can_remove = false;
-                     assert(false && "Pseudo instructions should be lowered by this point.");
-                  }
-
-                  if (!prefer_remove) {
-                     /* Under these conditions, we shouldn't remove the branch.
-                      * Don't care about the estimated cycles when the shader prefers flattening.
-                      */
-                     unsigned est_cycles;
-                     if (ctx.program->gfx_level >= GFX10)
-                        est_cycles = num_scalar * 2 + num_vector;
-                     else
-                        est_cycles = num_scalar * 4 + num_vector * 4;
-
-                     if (est_cycles > 16)
-                        can_remove = false;
-                  }
-
-                  if (!can_remove)
-                     break;
-               }
-            }
-
-            if (can_remove)
-               continue;
-
-            /* emit branch instruction */
-            switch (instr->opcode) {
-            case aco_opcode::p_branch:
-               assert(block->linear_succs[0] == target);
-               bld.sopp(aco_opcode::s_branch, target);
-               break;
-            case aco_opcode::p_cbranch_nz:
-               assert(block->linear_succs[1] == target);
-               if (branch->operands[0].physReg() == exec)
-                  bld.sopp(aco_opcode::s_cbranch_execnz, target);
-               else if (branch->operands[0].physReg() == vcc)
-                  bld.sopp(aco_opcode::s_cbranch_vccnz, target);
-               else {
-                  assert(branch->operands[0].physReg() == scc);
-                  bld.sopp(aco_opcode::s_cbranch_scc1, target);
-               }
-               break;
-            case aco_opcode::p_cbranch_z:
-               assert(block->linear_succs[1] == target);
-               if (branch->operands[0].physReg() == exec)
-                  bld.sopp(aco_opcode::s_cbranch_execz, target);
-               else if (branch->operands[0].physReg() == vcc)
-                  bld.sopp(aco_opcode::s_cbranch_vccz, target);
-               else {
-                  assert(branch->operands[0].physReg() == scc);
-                  bld.sopp(aco_opcode::s_cbranch_scc0, target);
-               }
-               break;
-            default: unreachable("Unknown Pseudo branch instruction!");
-            }
-
          } else if (instr->isReduction()) {
             Pseudo_reduction_instruction& reduce = instr->reduction();
             emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
src/amd/compiler/meson.build

@@ -50,6 +50,7 @@ libaco_files = files(
   'aco_reduce_assign.cpp',
   'aco_register_allocation.cpp',
   'aco_live_var_analysis.cpp',
+  'aco_lower_branches.cpp',
   'aco_lower_phis.cpp',
   'aco_lower_subdword.cpp',
   'aco_lower_to_cssa.cpp',