mesa/src/amd/compiler/aco_insert_exec_mask.cpp
Daniel Schürmann fdb97d3d29 aco: execute branch instructions in WQM if necessary
It could happen that only the branch condition was computed in WQM
and not the branch instruction.
There is now some redundancy which should be cleaned up.

Fixes: 3817fa7a4d ('aco: fix WQM handling in nested loops')
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6260>
2020-08-11 15:35:59 +00:00

/*
* Copyright © 2019 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/
#include "aco_ir.h"
#include "aco_builder.h"
#include "util/u_math.h"
namespace aco {
namespace {
enum WQMState : uint8_t {
Unspecified = 0,
Exact = 1 << 0,
WQM = 1 << 1, /* with control flow applied */
Preserve_WQM = 1 << 2,
Exact_Branch = 1 << 3,
};
enum mask_type : uint8_t {
mask_type_global = 1 << 0,
mask_type_exact = 1 << 1,
mask_type_wqm = 1 << 2,
mask_type_loop = 1 << 3, /* active lanes of a loop */
mask_type_initial = 1 << 4, /* initially active lanes */
};
struct wqm_ctx {
Program* program;
/* state for WQM propagation */
std::set<unsigned> worklist;
std::vector<uint16_t> defined_in;
std::vector<bool> needs_wqm;
std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
bool loop;
bool wqm;
wqm_ctx(Program* program) : program(program),
defined_in(program->peekAllocationId(), 0xFFFF),
needs_wqm(program->peekAllocationId()),
branch_wqm(program->blocks.size()),
loop(false),
wqm(false)
{
for (unsigned i = 0; i < program->blocks.size(); i++)
worklist.insert(i);
}
};
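/* Per-loop state: pushed when processing the loop preheader and popped again
* in the loop exit block. */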
struct loop_info {
Block* loop_header;
uint16_t num_exec_masks;
uint8_t needs;
bool has_divergent_break;
bool has_divergent_continue;
bool has_discard; /* has a discard or demote */
loop_info(Block* b, uint16_t num, uint8_t needs, bool breaks, bool cont, bool discard) :
loop_header(b), num_exec_masks(num), needs(needs), has_divergent_break(breaks),
has_divergent_continue(cont), has_discard(discard) {}
};
struct block_info {
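/* Stack of exec masks for this block: each entry pairs the temporary holding
* the mask with its mask_type flags. The innermost entry (exec.back()) is the
* mask currently live in the exec register. */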
std::vector<std::pair<Temp, uint8_t>> exec;
std::vector<WQMState> instr_needs;
uint8_t block_needs;
uint8_t ever_again_needs;
bool logical_end_wqm;
/* more... */
};
struct exec_ctx {
Program *program;
std::vector<block_info> info;
std::vector<loop_info> loop;
bool handle_wqm = false;
exec_ctx(Program *program) : program(program), info(program->blocks.size()) {}
};
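/* Returns true if the instruction's behaviour depends on the exec mask:
* SALU instructions only if they read exec; SMEM, barriers, spills/reloads
* and v_readlane/v_writelane never; vector pseudo-instructions only if they
* write a VGPR. */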
bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
if (instr->isSALU())
return instr->reads_exec();
if (instr->format == Format::SMEM || instr->isSALU())
return false;
if (instr->format == Format::PSEUDO_BARRIER)
return false;
if (instr->format == Format::PSEUDO) {
switch (instr->opcode) {
case aco_opcode::p_create_vector:
case aco_opcode::p_extract_vector:
case aco_opcode::p_split_vector:
for (Definition def : instr->definitions) {
if (def.getTemp().type() == RegType::vgpr)
return true;
}
return false;
case aco_opcode::p_spill:
case aco_opcode::p_reload:
return false;
default:
break;
}
}
if (instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64 ||
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64)
return false;
return true;
}
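/* Returns true if the instruction must run with the exact mask (no helper
* lanes): buffer/image/flat/global accesses with disable_wqm set, exports
* and p_fs_buffer_store_smem. */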
bool needs_exact(aco_ptr<Instruction>& instr) {
if (instr->format == Format::MUBUF) {
MUBUF_instruction *mubuf = static_cast<MUBUF_instruction *>(instr.get());
return mubuf->disable_wqm;
} else if (instr->format == Format::MTBUF) {
MTBUF_instruction *mtbuf = static_cast<MTBUF_instruction *>(instr.get());
return mtbuf->disable_wqm;
} else if (instr->format == Format::MIMG) {
MIMG_instruction *mimg = static_cast<MIMG_instruction *>(instr.get());
return mimg->disable_wqm;
} else if (instr->format == Format::FLAT || instr->format == Format::GLOBAL) {
FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
return flat->disable_wqm;
} else {
return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
}
}
void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
{
if (!ctx.needs_wqm[tmp.id()]) {
ctx.needs_wqm[tmp.id()] = true;
if (ctx.defined_in[tmp.id()] != 0xFFFF)
ctx.worklist.insert(ctx.defined_in[tmp.id()]);
}
}
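/* Marks the branch condition of this block as needing WQM and propagates the
* flag through the logical predecessors until a top-level block is reached. */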
void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
{
if (ctx.branch_wqm[block_idx])
return;
ctx.branch_wqm[block_idx] = true;
Block& block = ctx.program->blocks[block_idx];
/* TODO: this sets more branch conditions to WQM than it needs to:
* it should be enough to stop at the "exec mask top level" */
if (block.kind & block_kind_top_level)
return;
for (unsigned pred_idx : block.logical_preds)
mark_block_wqm(ctx, pred_idx);
}
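/* Computes, in a backwards scan over the block, which instructions need WQM
* or Exact execution, propagates WQM requirements into the operands of such
* instructions and accumulates the result in block_needs. */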
void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
{
block_info& info = exec_ctx.info[block->index];
std::vector<WQMState> instr_needs(block->instructions.size());
if (block->kind & block_kind_top_level) {
if (ctx.loop && ctx.wqm) {
unsigned block_idx = block->index + 1;
while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)) {
/* flag all break conditions as WQM:
* the conditions might be computed outside the nested CF */
if (ctx.program->blocks[block_idx].kind & block_kind_break)
mark_block_wqm(ctx, block_idx);
/* flag all blocks as WQM to ensure we enter all (nested) loops in WQM */
exec_ctx.info[block_idx].block_needs |= WQM;
block_idx++;
}
} else if (ctx.loop && !ctx.wqm) {
/* Ensure a branch never results in an exec mask with only helper
* invocations (which can cause a loop to repeat infinitely if its
* break branches are done in Exact). */
unsigned block_idx = block->index;
do {
if ((ctx.program->blocks[block_idx].kind & block_kind_branch))
exec_ctx.info[block_idx].block_needs |= Exact_Branch;
block_idx++;
} while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level));
}
ctx.loop = false;
ctx.wqm = false;
}
for (int i = block->instructions.size() - 1; i >= 0; --i) {
aco_ptr<Instruction>& instr = block->instructions[i];
WQMState needs = needs_exact(instr) ? Exact : Unspecified;
bool propagate_wqm = instr->opcode == aco_opcode::p_wqm;
bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
bool pred_by_exec = pred_by_exec_mask(instr);
for (const Definition& definition : instr->definitions) {
if (!definition.isTemp())
continue;
const unsigned def = definition.tempId();
ctx.defined_in[def] = block->index;
if (needs == Unspecified && ctx.needs_wqm[def]) {
needs = pred_by_exec ? WQM : Unspecified;
propagate_wqm = true;
}
}
if (instr->format == Format::PSEUDO_BRANCH && ctx.branch_wqm[block->index]) {
needs = WQM;
propagate_wqm = true;
}
if (propagate_wqm) {
for (const Operand& op : instr->operands) {
if (op.isTemp()) {
set_needs_wqm(ctx, op.getTemp());
}
}
} else if (preserve_wqm && info.block_needs & WQM) {
needs = Preserve_WQM;
}
/* ensure the condition controlling the control flow for this phi is in WQM */
if (needs == WQM && instr->opcode == aco_opcode::p_phi) {
for (unsigned pred_idx : block->logical_preds) {
mark_block_wqm(ctx, pred_idx);
exec_ctx.info[pred_idx].logical_end_wqm = true;
ctx.worklist.insert(pred_idx);
}
}
if ((instr->opcode == aco_opcode::p_logical_end && info.logical_end_wqm) ||
instr->opcode == aco_opcode::p_wqm) {
assert(needs != Exact);
needs = WQM;
}
instr_needs[i] = needs;
info.block_needs |= needs;
}
info.instr_needs = instr_needs;
/* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
* <cond> should be computed in WQM */
if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) {
for (unsigned pred_idx : block->logical_preds)
mark_block_wqm(ctx, pred_idx);
ctx.wqm = true;
}
if (block->kind & block_kind_loop_header)
ctx.loop = true;
}
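/* Worklist-driven computation of the per-block and per-instruction WQM/Exact
* needs, followed by a backwards pass that records what later blocks still
* need (ever_again_needs) and where the WQM mask must be preserved across
* discards. */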
void calculate_wqm_needs(exec_ctx& exec_ctx)
{
wqm_ctx ctx(exec_ctx.program);
while (!ctx.worklist.empty()) {
unsigned block_index = *std::prev(ctx.worklist.end());
ctx.worklist.erase(std::prev(ctx.worklist.end()));
get_block_needs(ctx, exec_ctx, &exec_ctx.program->blocks[block_index]);
}
uint8_t ever_again_needs = 0;
for (int i = exec_ctx.program->blocks.size() - 1; i >= 0; i--) {
exec_ctx.info[i].ever_again_needs = ever_again_needs;
Block& block = exec_ctx.program->blocks[i];
if (block.kind & block_kind_needs_lowering)
exec_ctx.info[i].block_needs |= Exact;
/* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
if ((block.kind & block_kind_discard ||
block.kind & block_kind_uses_discard_if) &&
ever_again_needs & WQM)
exec_ctx.info[i].block_needs |= Preserve_WQM;
ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
if (block.kind & block_kind_discard ||
block.kind & block_kind_uses_discard_if ||
block.kind & block_kind_uses_demote)
ever_again_needs |= Exact;
/* don't propagate WQM preservation further than the next top_level block */
if (block.kind & block_kind_top_level)
ever_again_needs &= ~Preserve_WQM;
else
exec_ctx.info[i].block_needs &= ~Preserve_WQM;
}
exec_ctx.handle_wqm = true;
}
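/* Switches the current exec mask to WQM: either applies s_wqm to the global
* mask and pushes the result, or pops back to the WQM mask directly below
* the current exact one. */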
void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
{
if (ctx.info[idx].exec.back().second & mask_type_wqm)
return;
if (ctx.info[idx].exec.back().second & mask_type_global) {
Temp exec_mask = ctx.info[idx].exec.back().first;
/* TODO: we might generate better code if we pass the uncopied "exec_mask"
* directly to the s_wqm (we still need to keep this parallelcopy for
* potential later uses of exec_mask though). We currently can't do this
* because of a RA bug. */
exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), bld.exec(exec_mask));
ctx.info[idx].exec.back().first = exec_mask;
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
return;
}
/* otherwise, the WQM mask should be one below the current mask */
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
}
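/* Switches the current exec mask to Exact: either pops back to the exact
* mask below the current global WQM mask, or computes the intersection of
* the outermost exact mask with the current mask and pushes it. */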
void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
{
if (ctx.info[idx].exec.back().second & mask_type_exact)
return;
/* We can't remove the loop exec mask, because that can cause exec.size() to
* be less than num_exec_masks. The loop exec mask also needs to be kept
* around for various uses. */
if ((ctx.info[idx].exec.back().second & mask_type_global) &&
!(ctx.info[idx].exec.back().second & mask_type_loop)) {
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
return;
}
/* otherwise, we create an exact mask and push to the stack */
Temp wqm = ctx.info[idx].exec.back().first;
Temp exact = bld.tmp(bld.lm);
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
ctx.info[idx].exec.back().first = wqm;
ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
}
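/* Reconstructs the exec mask stack at the start of a block from its linear
* predecessors: initializes it in the start block, creates linear phis for
* loop headers and loop exits, and merges the masks of the two predecessors
* at merge blocks. Returns the index of the first instruction that still has
* to be processed. */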
unsigned add_coupling_code(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions)
{
unsigned idx = block->index;
Builder bld(ctx.program, &instructions);
std::vector<unsigned>& preds = block->linear_preds;
/* start block */
if (idx == 0) {
aco_ptr<Instruction>& startpgm = block->instructions[0];
assert(startpgm->opcode == aco_opcode::p_startpgm);
Temp exec_mask = startpgm->definitions.back().getTemp();
bld.insert(std::move(startpgm));
/* exec seems to need manual initialization with combined shaders */
if (util_bitcount(ctx.program->stage & sw_mask) > 1 || (ctx.program->stage & hw_ngg_gs)) {
bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
instructions[0]->definitions.pop_back();
}
if (ctx.handle_wqm) {
ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial);
/* if this block only needs WQM, transition to WQM right away */
if (ctx.info[0].block_needs == WQM)
transition_to_WQM(ctx, bld, 0);
} else {
uint8_t mask = mask_type_global;
if (ctx.program->needs_wqm) {
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
mask |= mask_type_wqm;
} else {
mask |= mask_type_exact;
}
ctx.info[0].exec.emplace_back(exec_mask, mask);
}
return 1;
}
/* loop entry block */
if (block->kind & block_kind_loop_header) {
assert(preds[0] == idx - 1);
ctx.info[idx].exec = ctx.info[idx - 1].exec;
loop_info& info = ctx.loop.back();
while (ctx.info[idx].exec.size() > info.num_exec_masks)
ctx.info[idx].exec.pop_back();
/* create ssa names for outer exec masks */
if (info.has_discard) {
aco_ptr<Pseudo_instruction> phi;
for (int i = 0; i < info.num_exec_masks - 1; i++) {
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
}
}
/* create ssa name for restore mask */
if (info.has_divergent_break) {
/* this phi might be trivial but ensures a parallelcopy on the loop header */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
}
/* create ssa name for loop active mask */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
if (info.has_divergent_continue)
phi->definitions[0] = bld.def(bld.lm);
else
phi->definitions[0] = bld.def(bld.lm, exec);
phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
Temp loop_active = bld.insert(std::move(phi));
if (info.has_divergent_break) {
uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
ctx.info[idx].exec.emplace_back(loop_active, mask_type);
} else {
ctx.info[idx].exec.back().first = loop_active;
ctx.info[idx].exec.back().second |= mask_type_loop;
}
/* create a parallelcopy to move the active mask to exec */
unsigned i = 0;
if (info.has_divergent_continue) {
while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
bld.insert(std::move(block->instructions[i]));
i++;
}
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first), mask_type);
}
return i;
}
/* loop exit block */
if (block->kind & block_kind_loop_exit) {
Block* header = ctx.loop.back().loop_header;
loop_info& info = ctx.loop.back();
for (ASSERTED unsigned pred : preds)
assert(ctx.info[pred].exec.size() >= info.num_exec_masks);
/* fill the loop header phis */
std::vector<unsigned>& header_preds = header->linear_preds;
int k = 0;
if (info.has_discard) {
while (k < info.num_exec_masks - 1) {
aco_ptr<Instruction>& phi = header->instructions[k];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[k].first);
k++;
}
}
aco_ptr<Instruction>& phi = header->instructions[k++];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
if (info.has_divergent_break) {
aco_ptr<Instruction>& phi = header->instructions[k];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
}
assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
/* create the loop exit phis if not trivial */
bool need_parallelcopy = false;
for (unsigned k = 0; k < info.num_exec_masks; k++) {
Temp same = ctx.info[preds[0]].exec[k].first;
uint8_t type = ctx.info[header_preds[0]].exec[k].second;
bool trivial = true;
for (unsigned i = 1; i < preds.size() && trivial; i++) {
if (ctx.info[preds[i]].exec[k].first != same)
trivial = false;
}
if (k == info.num_exec_masks - 1u) {
bool all_liveout_exec = true;
bool all_not_liveout_exec = true;
for (unsigned pred : preds) {
all_liveout_exec = all_liveout_exec && same == ctx.program->blocks[pred].live_out_exec;
all_not_liveout_exec = all_not_liveout_exec && same != ctx.program->blocks[pred].live_out_exec;
}
if (!all_liveout_exec && !all_not_liveout_exec)
trivial = false;
else if (all_not_liveout_exec)
need_parallelcopy = true;
need_parallelcopy |= !trivial;
}
if (trivial) {
ctx.info[idx].exec.emplace_back(same, type);
} else {
/* create phi for loop footer */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
if (k == info.num_exec_masks - 1u) {
phi->definitions[0].setFixed(exec);
need_parallelcopy = false;
}
for (unsigned i = 0; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
}
}
assert(ctx.info[idx].exec.size() == info.num_exec_masks);
/* create a parallelcopy to move the live mask to exec */
unsigned i = 0;
while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
bld.insert(std::move(block->instructions[i]));
i++;
}
if (ctx.handle_wqm) {
if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
(ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
ctx.info[idx].exec.back().second |= mask_type_global;
transition_to_Exact(ctx, bld, idx);
ctx.handle_wqm = false;
}
}
if (ctx.info[idx].block_needs == WQM)
transition_to_WQM(ctx, bld, idx);
else if (ctx.info[idx].block_needs == Exact)
transition_to_Exact(ctx, bld, idx);
}
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
if (need_parallelcopy) {
/* only create this parallelcopy if needed, since the operand isn't
* fixed to exec, which causes the spiller to miscalculate register demand */
/* TODO: Fix register_demand calculation for spilling on loop exits.
* The problem is only mitigated because the register demand could be
* higher if the exec phi doesn't get assigned to exec. */
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
}
ctx.loop.pop_back();
return i;
}
if (preds.size() == 1) {
ctx.info[idx].exec = ctx.info[preds[0]].exec;
} else {
assert(preds.size() == 2);
/* if one of the predecessors ends in an exact mask, we pop it from the stack */
unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(),
ctx.info[preds[1]].exec.size());
if (block->kind & block_kind_top_level && !(block->kind & block_kind_merge))
num_exec_masks = std::min(num_exec_masks, 2u);
/* create phis for diverged exec masks */
for (unsigned i = 0; i < num_exec_masks; i++) {
bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
if (!in_exec && ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
assert(ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
ctx.info[idx].exec.emplace_back(ctx.info[preds[0]].exec[i]);
continue;
}
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
ctx.info[preds[0]].exec[i].first,
ctx.info[preds[1]].exec[i].first);
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
ctx.info[idx].exec.emplace_back(phi, mask_type);
}
}
unsigned i = 0;
while (block->instructions[i]->opcode == aco_opcode::p_phi ||
block->instructions[i]->opcode == aco_opcode::p_linear_phi) {
bld.insert(std::move(block->instructions[i]));
i++;
}
if (block->kind & block_kind_merge)
ctx.info[idx].exec.pop_back();
if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 3) {
assert(ctx.info[idx].exec.back().second == mask_type_exact);
assert(block->kind & block_kind_merge);
ctx.info[idx].exec.pop_back();
}
/* try to satisfy the block's needs */
if (ctx.handle_wqm) {
if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
(ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
ctx.info[idx].exec.back().second |= mask_type_global;
transition_to_Exact(ctx, bld, idx);
ctx.handle_wqm = false;
}
}
if (ctx.info[idx].block_needs == WQM)
transition_to_WQM(ctx, bld, idx);
else if (ctx.info[idx].block_needs == Exact)
transition_to_Exact(ctx, bld, idx);
}
if (block->kind & block_kind_merge) {
Temp restore = ctx.info[idx].exec.back().first;
assert(restore.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
}
return i;
}
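/* Lowers p_fs_buffer_store_smem to a real s_buffer_store_dword*: the offset
* is moved into m0 if it can't be encoded directly and, if exec might be
* empty, an offset of UINT32_MAX is selected so the store becomes a no-op. */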
void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
{
Operand offset = instr->operands[1];
if (need_check) {
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
if (offset.isLiteral())
offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
offset, Operand(UINT32_MAX), bld.scc(nonempty));
} else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
offset = bld.sop1(aco_opcode::s_mov_b32, bld.hint_m0(bld.def(s1)), offset);
}
if (!offset.isConstant())
offset.setFixed(m0);
switch (instr->operands[2].size()) {
case 1:
instr->opcode = aco_opcode::s_buffer_store_dword;
break;
case 2:
instr->opcode = aco_opcode::s_buffer_store_dwordx2;
break;
case 4:
instr->opcode = aco_opcode::s_buffer_store_dwordx4;
break;
default:
unreachable("Invalid SMEM buffer store size");
}
instr->operands[1] = offset;
/* as_uniform() needs to be done here so it's done in exact mode and helper
* lanes don't contribute. */
instr->operands[2] = Operand(bld.as_uniform(instr->operands[2]));
}
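/* Processes the instructions of a block: inserts WQM/Exact transitions where
* an instruction requires a different mode and lowers p_discard_if,
* p_is_helper, p_load_helper, p_demote_to_helper and p_fs_buffer_store_smem. */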
void process_instructions(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions,
unsigned idx)
{
WQMState state;
if (ctx.info[block->index].exec.back().second & mask_type_wqm)
state = WQM;
else {
assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact);
state = Exact;
}
/* if the block doesn't need both WQM and Exact, we can skip processing the instructions */
bool process = (ctx.handle_wqm &&
(ctx.info[block->index].block_needs & state) !=
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
block->kind & block_kind_uses_discard_if ||
block->kind & block_kind_uses_demote ||
block->kind & block_kind_needs_lowering;
if (!process) {
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
instructions.insert(instructions.end(),
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
return;
}
Builder bld(ctx.program, &instructions);
for (; idx < block->instructions.size(); idx++) {
aco_ptr<Instruction> instr = std::move(block->instructions[idx]);
WQMState needs = ctx.handle_wqm ? ctx.info[block->index].instr_needs[idx] : Unspecified;
if (instr->opcode == aco_opcode::p_discard_if) {
if (ctx.info[block->index].block_needs & Preserve_WQM) {
assert(block->kind & block_kind_top_level);
transition_to_WQM(ctx, bld, block->index);
ctx.info[block->index].exec.back().second &= ~mask_type_global;
}
int num = ctx.info[block->index].exec.size();
assert(num);
Operand cond = instr->operands[0];
for (int i = num - 1; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == num - 1) {
andn2->operands[0].setFixed(exec);
andn2->definitions[0].setFixed(exec);
}
if (i == 0) {
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
}
ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
}
assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
} else if (needs == WQM && state != WQM) {
transition_to_WQM(ctx, bld, block->index);
state = WQM;
} else if (needs == Exact && state != Exact) {
transition_to_Exact(ctx, bld, block->index);
state = Exact;
}
if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) {
Definition dst = instr->definitions[0];
assert(dst.size() == bld.lm.size());
if (state == Exact) {
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
instr->operands[0] = Operand(0u);
instr->definitions[0] = dst;
} else {
std::pair<Temp, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
if (instr->opcode == aco_opcode::p_load_helper &&
!(ctx.info[block->index].exec[0].second & mask_type_initial)) {
/* find last initial exact mask */
for (int i = block->index; i >= 0; i--) {
if (ctx.program->blocks[i].kind & block_kind_top_level &&
ctx.info[i].exec[0].second & mask_type_initial) {
exact_mask = ctx.info[i].exec[0];
break;
}
}
}
assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial);
assert(exact_mask.second & mask_type_exact);
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
instr->operands[1] = Operand(exact_mask.first);
instr->definitions[0] = dst;
instr->definitions[1] = bld.def(s1, scc);
}
} else if (instr->opcode == aco_opcode::p_demote_to_helper) {
/* turn demote into discard_if with only exact masks */
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global));
ctx.info[block->index].exec[0].second &= ~mask_type_initial;
int num;
Temp cond, exit_cond;
if (instr->operands[0].isConstant()) {
assert(instr->operands[0].constantValue() == -1u);
/* transition to exact and set exec to zero */
Temp old_exec = ctx.info[block->index].exec.back().first;
Temp new_exec = bld.tmp(bld.lm);
exit_cond = bld.tmp(s1);
cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
num = ctx.info[block->index].exec.size() - 2;
if (ctx.info[block->index].exec.back().second & mask_type_exact) {
ctx.info[block->index].exec.back().first = new_exec;
} else {
ctx.info[block->index].exec.back().first = cond;
ctx.info[block->index].exec.emplace_back(new_exec, mask_type_exact);
}
} else {
/* demote_if: transition to exact */
transition_to_Exact(ctx, bld, block->index);
assert(instr->operands[0].isTemp());
cond = instr->operands[0].getTemp();
num = ctx.info[block->index].exec.size() - 1;
}
for (int i = num; i >= 0; i--) {
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == (int)ctx.info[block->index].exec.size() - 1) {
andn2->operands[0].setFixed(exec);
andn2->definitions[0].setFixed(exec);
}
ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
exit_cond = andn2->definitions[1].getTemp();
} else {
assert(i != 0);
}
}
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(exit_cond);
state = Exact;
} else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
bool need_check = ctx.info[block->index].exec.size() != 1 &&
!(ctx.info[block->index].exec[ctx.info[block->index].exec.size() - 2].second & Exact);
lower_fs_buffer_store_smem(bld, need_check, instr, ctx.info[block->index].exec.back().first);
}
bld.insert(std::move(instr));
}
}
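/* Rewrites the branch at the end of the block depending on its kind (loop
* preheader, discard, continue-or-break, uniform, divergent branch, invert,
* break, continue) and updates the exec mask stack accordingly. */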
void add_branch_code(exec_ctx& ctx, Block* block)
{
unsigned idx = block->index;
Builder bld(ctx.program, block);
if (idx == ctx.program->blocks.size() - 1)
return;
/* try to disable wqm handling */
if (ctx.handle_wqm && block->kind & block_kind_top_level) {
if (ctx.info[idx].exec.size() == 3) {
assert(ctx.info[idx].exec[1].second == mask_type_wqm);
ctx.info[idx].exec.pop_back();
}
assert(ctx.info[idx].exec.size() <= 2);
if (ctx.info[idx].ever_again_needs == 0 ||
ctx.info[idx].ever_again_needs == Exact) {
/* transition to Exact */
aco_ptr<Instruction> branch = std::move(block->instructions.back());
block->instructions.pop_back();
ctx.info[idx].exec.back().second |= mask_type_global;
transition_to_Exact(ctx, bld, idx);
bld.insert(std::move(branch));
ctx.handle_wqm = false;
} else if (ctx.info[idx].block_needs & Preserve_WQM) {
/* transition to WQM and remove global flag */
aco_ptr<Instruction> branch = std::move(block->instructions.back());
block->instructions.pop_back();
transition_to_WQM(ctx, bld, idx);
ctx.info[idx].exec.back().second &= ~mask_type_global;
bld.insert(std::move(branch));
}
}
if (block->kind & block_kind_loop_preheader) {
/* collect information about the succeeding loop */
bool has_divergent_break = false;
bool has_divergent_continue = false;
bool has_discard = false;
uint8_t needs = 0;
unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;
for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
Block& loop_block = ctx.program->blocks[i];
needs |= ctx.info[i].block_needs;
if (loop_block.kind & block_kind_uses_discard_if ||
loop_block.kind & block_kind_discard ||
loop_block.kind & block_kind_uses_demote)
has_discard = true;
if (loop_block.loop_nest_depth != loop_nest_depth)
continue;
if (loop_block.kind & block_kind_uniform)
continue;
else if (loop_block.kind & block_kind_break)
has_divergent_break = true;
else if (loop_block.kind & block_kind_continue)
has_divergent_continue = true;
}
if (ctx.handle_wqm) {
if (needs & WQM) {
aco_ptr<Instruction> branch = std::move(block->instructions.back());
block->instructions.pop_back();
transition_to_WQM(ctx, bld, idx);
bld.insert(std::move(branch));
} else {
aco_ptr<Instruction> branch = std::move(block->instructions.back());
block->instructions.pop_back();
transition_to_Exact(ctx, bld, idx);
bld.insert(std::move(branch));
}
}
unsigned num_exec_masks = ctx.info[idx].exec.size();
if (block->kind & block_kind_top_level)
num_exec_masks = std::min(num_exec_masks, 2u);
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]],
num_exec_masks,
needs,
has_divergent_break,
has_divergent_continue,
has_discard);
}
if (block->kind & block_kind_discard) {
assert(block->instructions.back()->format == Format::PSEUDO_BRANCH);
aco_ptr<Instruction> branch = std::move(block->instructions.back());
block->instructions.pop_back();
/* create a discard_if() instruction with the exec mask as condition */
unsigned num = 0;
if (ctx.loop.size()) {
/* if we're in a loop, only discard from the outer exec masks */
num = ctx.loop.back().num_exec_masks;
} else {
num = ctx.info[idx].exec.size() - 1;
}
Temp old_exec = ctx.info[idx].exec.back().first;
Temp new_exec = bld.tmp(bld.lm);
Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
ctx.info[idx].exec.back().first = new_exec;
for (int i = num - 1; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == (int)ctx.info[idx].exec.size() - 1)
andn2->definitions[0].setFixed(exec);
if (i == 0)
bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
}
assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break)
ctx.info[idx].exec.back().first = cond;
bld.insert(std::move(branch));
/* no return here as it can be followed by a divergent break */
}
if (block->kind & block_kind_continue_or_break) {
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header);
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit);
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
block->instructions.pop_back();
bool need_parallelcopy = false;
while (!(ctx.info[idx].exec.back().second & mask_type_loop)) {
ctx.info[idx].exec.pop_back();
need_parallelcopy = true;
}
if (need_parallelcopy)
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first);
bld.branch(aco_opcode::p_cbranch_nz, bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]);
return;
}
if (block->kind & block_kind_uniform) {
Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(block->instructions.back().get());
if (branch->opcode == aco_opcode::p_branch) {
branch->target[0] = block->linear_succs[0];
} else {
branch->target[0] = block->linear_succs[1];
branch->target[1] = block->linear_succs[0];
}
return;
}
if (block->kind & block_kind_branch) {
if (ctx.handle_wqm &&
ctx.info[idx].exec.size() >= 2 &&
ctx.info[idx].exec.back().second == mask_type_exact &&
!(ctx.info[idx].block_needs & Exact_Branch) &&
ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
/* return to wqm before branching */
ctx.info[idx].exec.pop_back();
}
// orig = s_and_saveexec_b64
assert(block->linear_succs.size() == 2);
assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
Temp cond = block->instructions.back()->operands[0].getTemp();
block->instructions.pop_back();
if (ctx.info[idx].block_needs & Exact_Branch)
transition_to_Exact(ctx, bld, idx);
Temp current_exec = ctx.info[idx].exec.back().first;
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
Temp then_mask = bld.tmp(bld.lm);
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
ctx.info[idx].exec.back().first = old_exec;
/* add next current exec to the stack */
ctx.info[idx].exec.emplace_back(then_mask, mask_type);
bld.branch(aco_opcode::p_cbranch_z, bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]);
return;
}
if (block->kind & block_kind_invert) {
// exec = s_andn2_b64 (original_exec, exec)
assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_nz);
block->instructions.pop_back();
Temp then_mask = ctx.info[idx].exec.back().first;
uint8_t mask_type = ctx.info[idx].exec.back().second;
ctx.info[idx].exec.pop_back();
Temp orig_exec = ctx.info[idx].exec.back().first;
Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
bld.def(s1, scc), orig_exec, bld.exec(then_mask));
/* add next current exec to the stack */
ctx.info[idx].exec.emplace_back(else_mask, mask_type);
bld.branch(aco_opcode::p_cbranch_z, bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]);
return;
}
if (block->kind & block_kind_break) {
// loop_mask = s_andn2_b64 (loop_mask, exec)
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
block->instructions.pop_back();
Temp current_exec = ctx.info[idx].exec.back().first;
Temp cond = Temp();
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
cond = bld.tmp(s1);
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
exec_mask, bld.exec(current_exec));
ctx.info[idx].exec[exec_idx].first = exec_mask;
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
break;
}
/* check if the successor is the merge block, otherwise set exec to 0 */
// TODO: this could be done better by directly branching to the merge block
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
return;
}
if (block->kind & block_kind_continue) {
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
block->instructions.pop_back();
Temp current_exec = ctx.info[idx].exec.back().first;
Temp cond = Temp();
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
break;
cond = bld.tmp(s1);
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
exec_mask, bld.exec(current_exec));
ctx.info[idx].exec[exec_idx].first = exec_mask;
}
assert(cond != Temp());
/* check if the successor is the merge block, otherwise set exec to 0 */
// TODO: this could be done better by directly branching to the merge block
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
return;
}
}
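/* Processes a single block: emits the coupling code, the instructions and
* the branch code, then records the resulting exec mask as live_out_exec. */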
void process_block(exec_ctx& ctx, Block* block)
{
std::vector<aco_ptr<Instruction>> instructions;
instructions.reserve(block->instructions.size());
unsigned idx = add_coupling_code(ctx, block, instructions);
assert(block->index != ctx.program->blocks.size() - 1 ||
ctx.info[block->index].exec.size() <= 2);
process_instructions(ctx, block, instructions, idx);
block->instructions = std::move(instructions);
add_branch_code(ctx, block);
block->live_out_exec = ctx.info[block->index].exec.back().first;
}
} /* end namespace */
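/* Entry point: per-instruction WQM tracking is only enabled if the program
* contains both WQM and Exact instructions; otherwise the whole program keeps
* a single mask (WQM if program->needs_wqm, Exact otherwise). */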
void insert_exec_mask(Program *program)
{
exec_ctx ctx(program);
if (program->needs_wqm && program->needs_exact)
calculate_wqm_needs(ctx);
for (Block& block : program->blocks)
process_block(ctx, &block);
}
}