2019-09-17 13:22:17 +02:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2019 Valve Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "aco_builder.h"
|
2021-06-09 15:40:03 +02:00
|
|
|
#include "aco_ir.h"
|
2021-06-09 10:14:54 +02:00
|
|
|
|
2019-10-14 17:46:02 +01:00
|
|
|
#include "util/u_math.h"
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 15:40:03 +02:00
|
|
|
#include <set>
|
|
|
|
|
#include <vector>
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
namespace aco {
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
enum WQMState : uint8_t {
|
|
|
|
|
Unspecified = 0,
|
|
|
|
|
Exact = 1 << 0,
|
|
|
|
|
WQM = 1 << 1, /* with control flow applied */
|
|
|
|
|
Preserve_WQM = 1 << 2,
|
|
|
|
|
Exact_Branch = 1 << 3,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
enum mask_type : uint8_t {
|
|
|
|
|
mask_type_global = 1 << 0,
|
|
|
|
|
mask_type_exact = 1 << 1,
|
|
|
|
|
mask_type_wqm = 1 << 2,
|
|
|
|
|
mask_type_loop = 1 << 3, /* active lanes of a loop */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct wqm_ctx {
|
|
|
|
|
Program* program;
|
|
|
|
|
/* state for WQM propagation */
|
|
|
|
|
std::set<unsigned> worklist;
|
|
|
|
|
std::vector<uint16_t> defined_in;
|
|
|
|
|
std::vector<bool> needs_wqm;
|
|
|
|
|
std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
|
2021-06-09 10:14:54 +02:00
|
|
|
wqm_ctx(Program* program_)
|
|
|
|
|
: program(program_), defined_in(program->peekAllocationId(), 0xFFFF),
|
|
|
|
|
needs_wqm(program->peekAllocationId()), branch_wqm(program->blocks.size())
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
for (unsigned i = 0; i < program->blocks.size(); i++)
|
|
|
|
|
worklist.insert(i);
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct loop_info {
|
|
|
|
|
Block* loop_header;
|
|
|
|
|
uint16_t num_exec_masks;
|
|
|
|
|
uint8_t needs;
|
|
|
|
|
bool has_divergent_break;
|
|
|
|
|
bool has_divergent_continue;
|
2019-09-26 10:33:43 +01:00
|
|
|
bool has_discard; /* has a discard or demote */
|
2021-06-09 10:14:54 +02:00
|
|
|
loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard)
|
|
|
|
|
: loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
|
|
|
|
|
has_divergent_continue(cont), has_discard(discard)
|
|
|
|
|
{}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct block_info {
|
2021-06-09 10:14:54 +02:00
|
|
|
std::vector<std::pair<Operand, uint8_t>>
|
|
|
|
|
exec; /* Vector of exec masks. Either a temporary or const -1. */
|
2019-09-17 13:22:17 +02:00
|
|
|
std::vector<WQMState> instr_needs;
|
|
|
|
|
uint8_t block_needs;
|
|
|
|
|
uint8_t ever_again_needs;
|
2020-01-02 19:13:53 +00:00
|
|
|
bool logical_end_wqm;
|
2019-09-17 13:22:17 +02:00
|
|
|
/* more... */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct exec_ctx {
|
2021-06-09 10:14:54 +02:00
|
|
|
Program* program;
|
2019-09-17 13:22:17 +02:00
|
|
|
std::vector<block_info> info;
|
|
|
|
|
std::vector<loop_info> loop;
|
|
|
|
|
bool handle_wqm = false;
|
2021-06-09 10:14:54 +02:00
|
|
|
exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {}
|
2019-09-17 13:22:17 +02:00
|
|
|
};
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bool
|
|
|
|
|
needs_exact(aco_ptr<Instruction>& instr)
|
|
|
|
|
{
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isMUBUF()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
return instr->mubuf().disable_wqm;
|
2021-01-20 15:27:16 +00:00
|
|
|
} else if (instr->isMTBUF()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
return instr->mtbuf().disable_wqm;
|
2021-01-20 15:27:16 +00:00
|
|
|
} else if (instr->isMIMG()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
return instr->mimg().disable_wqm;
|
2021-01-20 15:27:16 +00:00
|
|
|
} else if (instr->isFlatLike()) {
|
2021-01-21 16:13:34 +00:00
|
|
|
return instr->flatlike().disable_wqm;
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
2021-01-20 15:27:16 +00:00
|
|
|
return instr->isEXP();
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
set_needs_wqm(wqm_ctx& ctx, Temp tmp)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (!ctx.needs_wqm[tmp.id()]) {
|
|
|
|
|
ctx.needs_wqm[tmp.id()] = true;
|
|
|
|
|
if (ctx.defined_in[tmp.id()] != 0xFFFF)
|
|
|
|
|
ctx.worklist.insert(ctx.defined_in[tmp.id()]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
mark_block_wqm(wqm_ctx& ctx, unsigned block_idx)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (ctx.branch_wqm[block_idx])
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
ctx.branch_wqm[block_idx] = true;
|
2020-12-17 16:52:24 +00:00
|
|
|
ctx.worklist.insert(block_idx);
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
Block& block = ctx.program->blocks[block_idx];
|
|
|
|
|
|
|
|
|
|
/* TODO: this sets more branch conditions to WQM than it needs to
|
|
|
|
|
* it should be enough to stop at the "exec mask top level" */
|
|
|
|
|
if (block.kind & block_kind_top_level)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
for (unsigned pred_idx : block.logical_preds)
|
|
|
|
|
mark_block_wqm(ctx, pred_idx);
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
block_info& info = exec_ctx.info[block->index];
|
|
|
|
|
|
|
|
|
|
std::vector<WQMState> instr_needs(block->instructions.size());
|
|
|
|
|
|
2019-11-11 16:21:51 +01:00
|
|
|
for (int i = block->instructions.size() - 1; i >= 0; --i) {
|
2019-09-17 13:22:17 +02:00
|
|
|
aco_ptr<Instruction>& instr = block->instructions[i];
|
|
|
|
|
|
|
|
|
|
WQMState needs = needs_exact(instr) ? Exact : Unspecified;
|
2021-06-09 10:14:54 +02:00
|
|
|
bool propagate_wqm =
|
|
|
|
|
instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_as_uniform;
|
2019-09-17 13:22:17 +02:00
|
|
|
bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
|
2020-08-12 16:58:35 +02:00
|
|
|
bool pred_by_exec = needs_exec_mask(instr.get());
|
2019-09-17 13:22:17 +02:00
|
|
|
for (const Definition& definition : instr->definitions) {
|
|
|
|
|
if (!definition.isTemp())
|
|
|
|
|
continue;
|
|
|
|
|
const unsigned def = definition.tempId();
|
|
|
|
|
ctx.defined_in[def] = block->index;
|
|
|
|
|
if (needs == Unspecified && ctx.needs_wqm[def]) {
|
|
|
|
|
needs = pred_by_exec ? WQM : Unspecified;
|
|
|
|
|
propagate_wqm = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-20 15:27:16 +00:00
|
|
|
if (instr->isBranch() && ctx.branch_wqm[block->index]) {
|
2021-04-07 11:51:29 +01:00
|
|
|
assert(!(info.block_needs & Exact_Branch));
|
2020-08-10 16:56:25 +02:00
|
|
|
needs = WQM;
|
|
|
|
|
propagate_wqm = true;
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
if (propagate_wqm) {
|
|
|
|
|
for (const Operand& op : instr->operands) {
|
|
|
|
|
if (op.isTemp()) {
|
|
|
|
|
set_needs_wqm(ctx, op.getTemp());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (preserve_wqm && info.block_needs & WQM) {
|
|
|
|
|
needs = Preserve_WQM;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* ensure the condition controlling the control flow for this phi is in WQM */
|
|
|
|
|
if (needs == WQM && instr->opcode == aco_opcode::p_phi) {
|
2020-01-02 19:13:53 +00:00
|
|
|
for (unsigned pred_idx : block->logical_preds) {
|
2019-09-17 13:22:17 +02:00
|
|
|
mark_block_wqm(ctx, pred_idx);
|
2020-01-02 19:13:53 +00:00
|
|
|
exec_ctx.info[pred_idx].logical_end_wqm = true;
|
|
|
|
|
ctx.worklist.insert(pred_idx);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-13 14:53:56 +00:00
|
|
|
if ((instr->opcode == aco_opcode::p_logical_end && info.logical_end_wqm) ||
|
|
|
|
|
instr->opcode == aco_opcode::p_wqm) {
|
|
|
|
|
assert(needs != Exact);
|
2020-01-02 19:13:53 +00:00
|
|
|
needs = WQM;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
instr_needs[i] = needs;
|
|
|
|
|
info.block_needs |= needs;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info.instr_needs = instr_needs;
|
|
|
|
|
|
|
|
|
|
/* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
|
|
|
|
|
* <cond> should be computed in WQM */
|
|
|
|
|
if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) {
|
|
|
|
|
for (unsigned pred_idx : block->logical_preds)
|
|
|
|
|
mark_block_wqm(ctx, pred_idx);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-04-07 11:51:29 +01:00
|
|
|
/* If an outer loop needs WQM but a nested loop does not, we have to ensure that
|
|
|
|
|
* the nested loop is done in WQM so that the exec is not empty upon entering
|
|
|
|
|
* the nested loop.
|
|
|
|
|
*
|
|
|
|
|
* TODO: This could be fixed with slightly better code (for loops with divergent
|
|
|
|
|
* breaks, which might benefit from being in exact) by adding Exact_Branch to a
|
|
|
|
|
* divergent branch surrounding the nested loop, if such a branch exists.
|
|
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
2021-01-15 16:00:35 +00:00
|
|
|
{
|
2021-04-07 11:51:29 +01:00
|
|
|
for (unsigned idx = preheader + 1; idx < exec_ctx.program->blocks.size(); idx++) {
|
|
|
|
|
Block& block = exec_ctx.program->blocks[idx];
|
|
|
|
|
if (block.kind & block_kind_break)
|
|
|
|
|
mark_block_wqm(ctx, idx);
|
2021-01-15 16:00:35 +00:00
|
|
|
|
2021-04-07 11:51:29 +01:00
|
|
|
if ((block.kind & block_kind_loop_exit) && block.loop_nest_depth == 0)
|
2021-01-15 16:00:35 +00:00
|
|
|
break;
|
|
|
|
|
}
|
2021-04-07 11:51:29 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If an outer loop and it's nested loops does not need WQM,
|
|
|
|
|
* add_branch_code() will ensure that it enters in Exact. We have to
|
|
|
|
|
* ensure that the exact exec mask is not empty by adding Exact_Branch to
|
|
|
|
|
* the outer divergent branch.
|
|
|
|
|
*/
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
2021-04-07 11:51:29 +01:00
|
|
|
{
|
2021-05-14 15:49:47 +02:00
|
|
|
assert(exec_ctx.program->blocks[preheader + 1].kind & block_kind_loop_header);
|
2021-01-15 16:00:35 +00:00
|
|
|
|
|
|
|
|
int parent_branch = preheader;
|
|
|
|
|
unsigned rel_branch_depth = 0;
|
|
|
|
|
for (; parent_branch >= 0; parent_branch--) {
|
|
|
|
|
Block& branch = exec_ctx.program->blocks[parent_branch];
|
|
|
|
|
if (branch.kind & block_kind_branch) {
|
|
|
|
|
if (rel_branch_depth == 0)
|
|
|
|
|
break;
|
|
|
|
|
rel_branch_depth--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* top-level blocks should never have empty exact exec masks */
|
|
|
|
|
if (branch.kind & block_kind_top_level)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (branch.kind & block_kind_merge)
|
|
|
|
|
rel_branch_depth++;
|
|
|
|
|
}
|
|
|
|
|
assert(parent_branch >= 0);
|
|
|
|
|
|
2021-03-16 20:10:51 +01:00
|
|
|
ASSERTED Block& branch = exec_ctx.program->blocks[parent_branch];
|
2021-01-15 16:00:35 +00:00
|
|
|
assert(branch.kind & block_kind_branch);
|
|
|
|
|
if (ctx.branch_wqm[parent_branch]) {
|
|
|
|
|
/* The branch can't be done in Exact because some other blocks in it
|
2021-04-07 11:51:29 +01:00
|
|
|
* are in WQM. So instead, ensure that the loop is done in WQM. */
|
|
|
|
|
handle_wqm_loops(ctx, exec_ctx, preheader);
|
2021-01-15 16:00:35 +00:00
|
|
|
} else {
|
|
|
|
|
exec_ctx.info[parent_branch].block_needs |= Exact_Branch;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
calculate_wqm_needs(exec_ctx& exec_ctx)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
wqm_ctx ctx(exec_ctx.program);
|
|
|
|
|
|
|
|
|
|
while (!ctx.worklist.empty()) {
|
|
|
|
|
unsigned block_index = *std::prev(ctx.worklist.end());
|
|
|
|
|
ctx.worklist.erase(std::prev(ctx.worklist.end()));
|
|
|
|
|
|
2021-01-15 16:00:35 +00:00
|
|
|
Block& block = exec_ctx.program->blocks[block_index];
|
|
|
|
|
get_block_needs(ctx, exec_ctx, &block);
|
|
|
|
|
|
2021-04-07 11:51:29 +01:00
|
|
|
/* handle_exact_loops() needs information on outer branches, so don't
|
|
|
|
|
* handle loops until a top-level block.
|
2021-01-15 16:00:35 +00:00
|
|
|
*/
|
|
|
|
|
if (block.kind & block_kind_top_level && block.index != exec_ctx.program->blocks.size() - 1) {
|
|
|
|
|
unsigned preheader = block.index;
|
|
|
|
|
do {
|
|
|
|
|
Block& preheader_block = exec_ctx.program->blocks[preheader];
|
|
|
|
|
if ((preheader_block.kind & block_kind_loop_preheader) &&
|
2021-04-07 11:51:29 +01:00
|
|
|
preheader_block.loop_nest_depth == 0) {
|
|
|
|
|
/* If the loop or a nested loop needs WQM, branch_wqm will be true for the
|
|
|
|
|
* preheader.
|
|
|
|
|
*/
|
|
|
|
|
if (ctx.branch_wqm[preheader])
|
|
|
|
|
handle_wqm_loops(ctx, exec_ctx, preheader);
|
|
|
|
|
else
|
|
|
|
|
handle_exact_loops(ctx, exec_ctx, preheader);
|
|
|
|
|
}
|
2021-01-15 16:00:35 +00:00
|
|
|
preheader++;
|
|
|
|
|
} while (!(exec_ctx.program->blocks[preheader].kind & block_kind_top_level));
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint8_t ever_again_needs = 0;
|
|
|
|
|
for (int i = exec_ctx.program->blocks.size() - 1; i >= 0; i--) {
|
|
|
|
|
exec_ctx.info[i].ever_again_needs = ever_again_needs;
|
|
|
|
|
Block& block = exec_ctx.program->blocks[i];
|
|
|
|
|
|
|
|
|
|
if (block.kind & block_kind_needs_lowering)
|
|
|
|
|
exec_ctx.info[i].block_needs |= Exact;
|
|
|
|
|
|
|
|
|
|
/* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
|
2021-06-09 10:14:54 +02:00
|
|
|
if ((block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if) &&
|
2019-09-17 13:22:17 +02:00
|
|
|
ever_again_needs & WQM)
|
|
|
|
|
exec_ctx.info[i].block_needs |= Preserve_WQM;
|
|
|
|
|
|
|
|
|
|
ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
|
2021-06-09 10:14:54 +02:00
|
|
|
if (block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if ||
|
2019-09-26 10:33:43 +01:00
|
|
|
block.kind & block_kind_uses_demote)
|
2019-09-17 13:22:17 +02:00
|
|
|
ever_again_needs |= Exact;
|
|
|
|
|
|
|
|
|
|
/* don't propagate WQM preservation further than the next top_level block */
|
|
|
|
|
if (block.kind & block_kind_top_level)
|
|
|
|
|
ever_again_needs &= ~Preserve_WQM;
|
|
|
|
|
else
|
|
|
|
|
exec_ctx.info[i].block_needs &= ~Preserve_WQM;
|
|
|
|
|
}
|
|
|
|
|
exec_ctx.handle_wqm = true;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
Operand
|
|
|
|
|
get_exec_op(Operand t)
|
2021-02-15 11:24:22 +00:00
|
|
|
{
|
2021-05-06 15:20:40 +02:00
|
|
|
if (t.isUndefined())
|
2021-02-15 11:24:22 +00:00
|
|
|
return Operand(exec, t.regClass());
|
|
|
|
|
else
|
2021-05-06 15:20:40 +02:00
|
|
|
return t;
|
2021-02-15 11:24:22 +00:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (ctx.info[idx].exec.back().second & mask_type_wqm)
|
|
|
|
|
return;
|
|
|
|
|
if (ctx.info[idx].exec.back().second & mask_type_global) {
|
2021-05-06 15:20:40 +02:00
|
|
|
Operand exec_mask = ctx.info[idx].exec.back().first;
|
|
|
|
|
if (exec_mask.isUndefined()) {
|
2021-02-15 11:24:22 +00:00
|
|
|
exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), Operand(exec, bld.lm));
|
|
|
|
|
ctx.info[idx].exec.back().first = exec_mask;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
|
|
|
|
|
get_exec_op(exec_mask));
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
/* otherwise, the WQM mask should be one below the current mask */
|
|
|
|
|
ctx.info[idx].exec.pop_back();
|
|
|
|
|
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
|
2019-11-27 11:04:47 +01:00
|
|
|
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
2021-05-06 15:20:40 +02:00
|
|
|
assert(ctx.info[idx].exec.back().first.isTemp());
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[idx].exec.back().first = bld.pseudo(
|
|
|
|
|
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
if (ctx.info[idx].exec.back().second & mask_type_exact)
|
|
|
|
|
return;
|
2019-09-26 15:38:09 +01:00
|
|
|
/* We can't remove the loop exec mask, because that can cause exec.size() to
|
|
|
|
|
* be less than num_exec_masks. The loop exec mask also needs to be kept
|
|
|
|
|
* around for various uses. */
|
|
|
|
|
if ((ctx.info[idx].exec.back().second & mask_type_global) &&
|
|
|
|
|
!(ctx.info[idx].exec.back().second & mask_type_loop)) {
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec.pop_back();
|
|
|
|
|
assert(ctx.info[idx].exec.back().second & mask_type_exact);
|
2019-11-27 11:04:47 +01:00
|
|
|
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
2021-05-06 15:20:40 +02:00
|
|
|
assert(ctx.info[idx].exec.back().first.isTemp());
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[idx].exec.back().first = bld.pseudo(
|
|
|
|
|
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
/* otherwise, we create an exact mask and push to the stack */
|
2021-05-06 15:20:40 +02:00
|
|
|
Operand wqm = ctx.info[idx].exec.back().first;
|
|
|
|
|
if (wqm.isUndefined()) {
|
2021-02-15 11:24:22 +00:00
|
|
|
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
|
|
|
|
Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm));
|
|
|
|
|
} else {
|
2021-06-09 10:14:54 +02:00
|
|
|
bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc),
|
|
|
|
|
ctx.info[idx].exec[0].first, wqm);
|
2021-02-15 11:24:22 +00:00
|
|
|
}
|
2021-05-06 15:20:40 +02:00
|
|
|
ctx.info[idx].exec.back().first = Operand(wqm);
|
|
|
|
|
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type_exact);
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
unsigned
|
|
|
|
|
add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
unsigned idx = block->index;
|
|
|
|
|
Builder bld(ctx.program, &instructions);
|
|
|
|
|
std::vector<unsigned>& preds = block->linear_preds;
|
|
|
|
|
|
|
|
|
|
/* start block */
|
|
|
|
|
if (idx == 0) {
|
|
|
|
|
aco_ptr<Instruction>& startpgm = block->instructions[0];
|
|
|
|
|
assert(startpgm->opcode == aco_opcode::p_startpgm);
|
|
|
|
|
bld.insert(std::move(startpgm));
|
|
|
|
|
|
2021-05-07 15:02:58 +02:00
|
|
|
Operand start_exec(bld.lm);
|
|
|
|
|
|
2019-10-14 17:46:02 +01:00
|
|
|
/* exec seems to need to be manually initialized with combined shaders */
|
2020-10-07 18:21:48 +02:00
|
|
|
if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG) {
|
2021-07-13 11:22:46 +02:00
|
|
|
start_exec = Operand::c32_or_c64(-1u, bld.lm == s2);
|
2021-05-07 15:02:58 +02:00
|
|
|
bld.copy(Definition(exec, bld.lm), start_exec);
|
2019-10-14 17:46:02 +01:00
|
|
|
}
|
|
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.handle_wqm) {
|
2021-05-07 15:02:58 +02:00
|
|
|
ctx.info[0].exec.emplace_back(start_exec, mask_type_global | mask_type_exact);
|
2019-09-17 13:22:17 +02:00
|
|
|
/* if this block only needs WQM, initialize already */
|
|
|
|
|
if (ctx.info[0].block_needs == WQM)
|
|
|
|
|
transition_to_WQM(ctx, bld, 0);
|
|
|
|
|
} else {
|
|
|
|
|
uint8_t mask = mask_type_global;
|
|
|
|
|
if (ctx.program->needs_wqm) {
|
2021-06-09 10:14:54 +02:00
|
|
|
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
|
|
|
|
|
Operand(exec, bld.lm));
|
2019-09-17 13:22:17 +02:00
|
|
|
mask |= mask_type_wqm;
|
|
|
|
|
} else {
|
|
|
|
|
mask |= mask_type_exact;
|
|
|
|
|
}
|
2021-05-07 15:02:58 +02:00
|
|
|
ctx.info[0].exec.emplace_back(start_exec, mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* loop entry block */
|
|
|
|
|
if (block->kind & block_kind_loop_header) {
|
|
|
|
|
assert(preds[0] == idx - 1);
|
|
|
|
|
ctx.info[idx].exec = ctx.info[idx - 1].exec;
|
|
|
|
|
loop_info& info = ctx.loop.back();
|
|
|
|
|
while (ctx.info[idx].exec.size() > info.num_exec_masks)
|
|
|
|
|
ctx.info[idx].exec.pop_back();
|
|
|
|
|
|
|
|
|
|
/* create ssa names for outer exec masks */
|
|
|
|
|
if (info.has_discard) {
|
|
|
|
|
aco_ptr<Pseudo_instruction> phi;
|
|
|
|
|
for (int i = 0; i < info.num_exec_masks - 1; i++) {
|
2021-06-09 10:14:54 +02:00
|
|
|
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
|
|
|
|
|
Format::PSEUDO, preds.size(), 1));
|
2019-11-27 11:04:47 +01:00
|
|
|
phi->definitions[0] = bld.def(bld.lm);
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* create ssa name for restore mask */
|
|
|
|
|
if (info.has_divergent_break) {
|
|
|
|
|
/* this phi might be trivial but ensures a parallelcopy on the loop header */
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
|
|
|
|
|
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
2019-11-27 11:04:47 +01:00
|
|
|
phi->definitions[0] = bld.def(bld.lm);
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* create ssa name for loop active mask */
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
|
|
|
|
|
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
2019-09-17 13:22:17 +02:00
|
|
|
if (info.has_divergent_continue)
|
2019-11-27 11:04:47 +01:00
|
|
|
phi->definitions[0] = bld.def(bld.lm);
|
2019-09-17 13:22:17 +02:00
|
|
|
else
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
phi->definitions[0] = Definition(exec, bld.lm);
|
|
|
|
|
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec.back().first);
|
2019-09-17 13:22:17 +02:00
|
|
|
Temp loop_active = bld.insert(std::move(phi));
|
|
|
|
|
|
|
|
|
|
if (info.has_divergent_break) {
|
2021-06-09 10:14:54 +02:00
|
|
|
uint8_t mask_type =
|
|
|
|
|
(ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec.emplace_back(loop_active, mask_type);
|
|
|
|
|
} else {
|
2021-05-06 15:20:40 +02:00
|
|
|
ctx.info[idx].exec.back().first = Operand(loop_active);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec.back().second |= mask_type_loop;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* create a parallelcopy to move the active mask to exec */
|
|
|
|
|
unsigned i = 0;
|
|
|
|
|
if (info.has_divergent_continue) {
|
|
|
|
|
while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
|
|
|
|
|
bld.insert(std::move(block->instructions[i]));
|
|
|
|
|
i++;
|
|
|
|
|
}
|
|
|
|
|
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
|
2019-11-27 11:04:47 +01:00
|
|
|
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[idx].exec.emplace_back(
|
|
|
|
|
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
|
|
|
|
|
ctx.info[idx].exec.back().first),
|
|
|
|
|
mask_type);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* loop exit block */
|
|
|
|
|
if (block->kind & block_kind_loop_exit) {
|
|
|
|
|
Block* header = ctx.loop.back().loop_header;
|
|
|
|
|
loop_info& info = ctx.loop.back();
|
|
|
|
|
|
|
|
|
|
for (ASSERTED unsigned pred : preds)
|
|
|
|
|
assert(ctx.info[pred].exec.size() >= info.num_exec_masks);
|
|
|
|
|
|
|
|
|
|
/* fill the loop header phis */
|
|
|
|
|
std::vector<unsigned>& header_preds = header->linear_preds;
|
2020-11-03 14:40:05 +01:00
|
|
|
int instr_idx = 0;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (info.has_discard) {
|
2020-11-03 14:40:05 +01:00
|
|
|
while (instr_idx < info.num_exec_masks - 1) {
|
|
|
|
|
aco_ptr<Instruction>& phi = header->instructions[instr_idx];
|
2019-09-17 13:22:17 +02:00
|
|
|
assert(phi->opcode == aco_opcode::p_linear_phi);
|
|
|
|
|
for (unsigned i = 1; i < phi->operands.size(); i++)
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[instr_idx].first);
|
2020-11-03 14:40:05 +01:00
|
|
|
instr_idx++;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2020-11-03 14:40:05 +01:00
|
|
|
|
|
|
|
|
{
|
|
|
|
|
aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
|
|
|
|
|
assert(phi->opcode == aco_opcode::p_linear_phi);
|
|
|
|
|
for (unsigned i = 1; i < phi->operands.size(); i++)
|
2021-06-09 10:14:54 +02:00
|
|
|
phi->operands[i] =
|
|
|
|
|
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
|
2020-11-03 14:40:05 +01:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
if (info.has_divergent_break) {
|
2020-11-03 14:40:05 +01:00
|
|
|
aco_ptr<Instruction>& phi = header->instructions[instr_idx];
|
2019-09-17 13:22:17 +02:00
|
|
|
assert(phi->opcode == aco_opcode::p_linear_phi);
|
|
|
|
|
for (unsigned i = 1; i < phi->operands.size(); i++)
|
2021-06-09 10:14:54 +02:00
|
|
|
phi->operands[i] =
|
|
|
|
|
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
|
|
|
|
|
|
|
|
|
|
/* create the loop exit phis if not trivial */
|
2020-11-03 14:40:05 +01:00
|
|
|
for (unsigned exec_idx = 0; exec_idx < info.num_exec_masks; exec_idx++) {
|
2021-05-06 15:20:40 +02:00
|
|
|
Operand same = ctx.info[preds[0]].exec[exec_idx].first;
|
2020-11-03 14:40:05 +01:00
|
|
|
uint8_t type = ctx.info[header_preds[0]].exec[exec_idx].second;
|
2019-09-17 13:22:17 +02:00
|
|
|
bool trivial = true;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 1; i < preds.size() && trivial; i++) {
|
2020-11-03 14:40:05 +01:00
|
|
|
if (ctx.info[preds[i]].exec[exec_idx].first != same)
|
2019-09-17 13:22:17 +02:00
|
|
|
trivial = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (trivial) {
|
|
|
|
|
ctx.info[idx].exec.emplace_back(same, type);
|
|
|
|
|
} else {
|
|
|
|
|
/* create phi for loop footer */
|
2021-06-09 10:14:54 +02:00
|
|
|
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
|
|
|
|
|
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
2019-11-27 11:04:47 +01:00
|
|
|
phi->definitions[0] = bld.def(bld.lm);
|
2020-11-03 14:40:05 +01:00
|
|
|
if (exec_idx == info.num_exec_masks - 1u) {
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
phi->definitions[0] = Definition(exec, bld.lm);
|
2020-01-06 15:17:21 +00:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
for (unsigned i = 0; i < phi->operands.size(); i++)
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
phi->operands[i] = get_exec_op(ctx.info[preds[i]].exec[exec_idx].first);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
assert(ctx.info[idx].exec.size() == info.num_exec_masks);
|
|
|
|
|
|
|
|
|
|
/* create a parallelcopy to move the live mask to exec */
|
|
|
|
|
unsigned i = 0;
|
|
|
|
|
while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
|
|
|
|
|
bld.insert(std::move(block->instructions[i]));
|
|
|
|
|
i++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ctx.handle_wqm) {
|
|
|
|
|
if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
|
|
|
|
|
if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
|
|
|
|
|
(ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
|
|
|
|
|
ctx.info[idx].exec.back().second |= mask_type_global;
|
|
|
|
|
transition_to_Exact(ctx, bld, idx);
|
|
|
|
|
ctx.handle_wqm = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (ctx.info[idx].block_needs == WQM)
|
|
|
|
|
transition_to_WQM(ctx, bld, idx);
|
|
|
|
|
else if (ctx.info[idx].block_needs == Exact)
|
|
|
|
|
transition_to_Exact(ctx, bld, idx);
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-27 11:04:47 +01:00
|
|
|
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
2021-02-04 16:01:44 +01:00
|
|
|
if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
|
|
|
|
|
/* move current exec mask into exec register */
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[idx].exec.back().first = bld.pseudo(
|
|
|
|
|
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
2020-01-06 15:17:21 +00:00
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
ctx.loop.pop_back();
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (preds.size() == 1) {
|
|
|
|
|
ctx.info[idx].exec = ctx.info[preds[0]].exec;
|
|
|
|
|
} else {
|
|
|
|
|
assert(preds.size() == 2);
|
|
|
|
|
/* if one of the predecessors ends in exact mask, we pop it from stack */
|
2021-06-09 10:14:54 +02:00
|
|
|
unsigned num_exec_masks =
|
|
|
|
|
std::min(ctx.info[preds[0]].exec.size(), ctx.info[preds[1]].exec.size());
|
2020-12-11 17:44:19 +01:00
|
|
|
|
|
|
|
|
if (block->kind & block_kind_merge)
|
|
|
|
|
num_exec_masks--;
|
2021-02-03 15:32:32 +01:00
|
|
|
if (block->kind & block_kind_top_level)
|
2019-09-17 13:22:17 +02:00
|
|
|
num_exec_masks = std::min(num_exec_masks, 2u);
|
|
|
|
|
|
|
|
|
|
/* create phis for diverged exec masks */
|
|
|
|
|
for (unsigned i = 0; i < num_exec_masks; i++) {
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
/* skip trivial phis */
|
|
|
|
|
if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
|
2021-05-06 15:20:40 +02:00
|
|
|
Operand t = ctx.info[preds[0]].exec[i].first;
|
2021-02-15 11:31:13 +01:00
|
|
|
/* discard/demote can change the state of the current exec mask */
|
2021-06-09 10:14:54 +02:00
|
|
|
assert(!t.isTemp() ||
|
|
|
|
|
ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
|
2021-02-15 11:31:13 +01:00
|
|
|
uint8_t mask = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
|
|
|
|
|
ctx.info[idx].exec.emplace_back(t, mask);
|
2019-09-17 13:22:17 +02:00
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
|
2021-06-09 10:14:54 +02:00
|
|
|
Temp phi = bld.pseudo(aco_opcode::p_linear_phi,
|
|
|
|
|
in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
get_exec_op(ctx.info[preds[0]].exec[i].first),
|
|
|
|
|
get_exec_op(ctx.info[preds[1]].exec[i].first));
|
2019-09-17 13:22:17 +02:00
|
|
|
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
|
|
|
|
|
ctx.info[idx].exec.emplace_back(phi, mask_type);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned i = 0;
|
|
|
|
|
while (block->instructions[i]->opcode == aco_opcode::p_phi ||
|
|
|
|
|
block->instructions[i]->opcode == aco_opcode::p_linear_phi) {
|
|
|
|
|
bld.insert(std::move(block->instructions[i]));
|
|
|
|
|
i++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* try to satisfy the block's needs */
|
|
|
|
|
if (ctx.handle_wqm) {
|
|
|
|
|
if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
|
|
|
|
|
if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
|
|
|
|
|
(ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
|
|
|
|
|
ctx.info[idx].exec.back().second |= mask_type_global;
|
|
|
|
|
transition_to_Exact(ctx, bld, idx);
|
|
|
|
|
ctx.handle_wqm = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (ctx.info[idx].block_needs == WQM)
|
|
|
|
|
transition_to_WQM(ctx, bld, idx);
|
|
|
|
|
else if (ctx.info[idx].block_needs == Exact)
|
|
|
|
|
transition_to_Exact(ctx, bld, idx);
|
|
|
|
|
}
|
|
|
|
|
|
2021-05-06 15:20:40 +02:00
|
|
|
if (block->kind & block_kind_merge && !ctx.info[idx].exec.back().first.isUndefined()) {
|
|
|
|
|
Operand restore = ctx.info[idx].exec.back().first;
|
2019-11-27 11:04:47 +01:00
|
|
|
assert(restore.size() == bld.lm.size());
|
2021-05-07 15:02:58 +02:00
|
|
|
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), restore);
|
|
|
|
|
if (!restore.isConstant())
|
|
|
|
|
ctx.info[idx].exec.back().first = Operand(bld.lm);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions,
|
|
|
|
|
unsigned idx)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
WQMState state;
|
|
|
|
|
if (ctx.info[block->index].exec.back().second & mask_type_wqm)
|
|
|
|
|
state = WQM;
|
|
|
|
|
else {
|
|
|
|
|
assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact);
|
|
|
|
|
state = Exact;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */
|
2021-06-09 10:14:54 +02:00
|
|
|
bool process = (ctx.handle_wqm && (ctx.info[block->index].block_needs & state) !=
|
|
|
|
|
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
|
2019-09-17 13:22:17 +02:00
|
|
|
block->kind & block_kind_uses_discard_if ||
|
2021-06-09 10:14:54 +02:00
|
|
|
block->kind & block_kind_uses_demote || block->kind & block_kind_needs_lowering;
|
2019-09-17 13:22:17 +02:00
|
|
|
if (!process) {
|
|
|
|
|
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
|
|
|
|
|
instructions.insert(instructions.end(),
|
|
|
|
|
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
|
2021-06-09 10:14:54 +02:00
|
|
|
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
|
|
|
|
|
block->instructions.end()));
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Builder bld(ctx.program, &instructions);
|
|
|
|
|
|
|
|
|
|
for (; idx < block->instructions.size(); idx++) {
|
|
|
|
|
aco_ptr<Instruction> instr = std::move(block->instructions[idx]);
|
|
|
|
|
|
|
|
|
|
WQMState needs = ctx.handle_wqm ? ctx.info[block->index].instr_needs[idx] : Unspecified;
|
|
|
|
|
|
|
|
|
|
if (instr->opcode == aco_opcode::p_discard_if) {
|
|
|
|
|
if (ctx.info[block->index].block_needs & Preserve_WQM) {
|
|
|
|
|
assert(block->kind & block_kind_top_level);
|
|
|
|
|
transition_to_WQM(ctx, bld, block->index);
|
|
|
|
|
ctx.info[block->index].exec.back().second &= ~mask_type_global;
|
|
|
|
|
}
|
2019-10-08 13:40:17 +01:00
|
|
|
int num = ctx.info[block->index].exec.size();
|
2019-09-17 13:22:17 +02:00
|
|
|
assert(num);
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
|
|
|
|
|
/* discard from current exec */
|
|
|
|
|
const Operand cond = instr->operands[0];
|
|
|
|
|
Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
|
2021-06-09 10:14:54 +02:00
|
|
|
Operand(exec, bld.lm), cond)
|
|
|
|
|
.def(1)
|
|
|
|
|
.getTemp();
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
|
|
|
|
|
/* discard from inner to outer exec mask on stack */
|
|
|
|
|
for (int i = num - 2; i >= 0; i--) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
2019-10-08 13:40:17 +01:00
|
|
|
ctx.info[block->index].exec[i].first, cond);
|
2021-05-06 15:20:40 +02:00
|
|
|
ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
exit_cond = andn2->definitions[1].getTemp();
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
|
|
|
|
|
instr->opcode = aco_opcode::p_exit_early_if;
|
|
|
|
|
instr->operands[0] = bld.scc(exit_cond);
|
2019-10-08 13:40:17 +01:00
|
|
|
assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
} else if (needs == WQM && state != WQM) {
|
|
|
|
|
transition_to_WQM(ctx, bld, block->index);
|
|
|
|
|
state = WQM;
|
|
|
|
|
} else if (needs == Exact && state != Exact) {
|
|
|
|
|
transition_to_Exact(ctx, bld, block->index);
|
|
|
|
|
state = Exact;
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-16 10:33:25 +01:00
|
|
|
if (instr->opcode == aco_opcode::p_is_helper) {
|
2019-09-17 13:22:17 +02:00
|
|
|
Definition dst = instr->definitions[0];
|
2019-11-27 11:04:47 +01:00
|
|
|
assert(dst.size() == bld.lm.size());
|
2019-09-17 13:22:17 +02:00
|
|
|
if (state == Exact) {
|
2021-06-09 10:14:54 +02:00
|
|
|
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov),
|
|
|
|
|
Format::SOP1, 1, 1));
|
2021-07-13 11:22:46 +02:00
|
|
|
instr->operands[0] = Operand::zero();
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->definitions[0] = dst;
|
|
|
|
|
} else {
|
2021-05-06 15:20:40 +02:00
|
|
|
std::pair<Operand, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
|
2019-09-17 13:22:17 +02:00
|
|
|
assert(exact_mask.second & mask_type_exact);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2),
|
|
|
|
|
Format::SOP2, 2, 2));
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
instr->operands[0] = Operand(exec, bld.lm); /* current exec */
|
2019-09-17 13:22:17 +02:00
|
|
|
instr->operands[1] = Operand(exact_mask.first);
|
|
|
|
|
instr->definitions[0] = dst;
|
|
|
|
|
instr->definitions[1] = bld.def(s1, scc);
|
|
|
|
|
}
|
|
|
|
|
} else if (instr->opcode == aco_opcode::p_demote_to_helper) {
|
|
|
|
|
/* turn demote into discard_if with only exact masks */
|
2021-06-09 10:14:54 +02:00
|
|
|
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) ==
|
|
|
|
|
(mask_type_exact | mask_type_global));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-01-10 17:19:40 +01:00
|
|
|
int num;
|
|
|
|
|
Temp cond, exit_cond;
|
|
|
|
|
if (instr->operands[0].isConstant()) {
|
2020-01-21 15:34:23 +01:00
|
|
|
assert(instr->operands[0].constantValue() == -1u);
|
2019-09-17 13:22:17 +02:00
|
|
|
/* transition to exact and set exec to zero */
|
2020-01-10 17:19:40 +01:00
|
|
|
exit_cond = bld.tmp(s1);
|
2021-06-09 10:14:54 +02:00
|
|
|
cond =
|
|
|
|
|
bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
|
2021-07-13 11:22:46 +02:00
|
|
|
Definition(exec, bld.lm), Operand::zero(), Operand(exec, bld.lm));
|
2020-01-10 17:19:40 +01:00
|
|
|
|
|
|
|
|
num = ctx.info[block->index].exec.size() - 2;
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) {
|
2021-05-06 15:20:40 +02:00
|
|
|
ctx.info[block->index].exec.back().first = Operand(cond);
|
|
|
|
|
ctx.info[block->index].exec.emplace_back(Operand(bld.lm), mask_type_exact);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
/* demote_if: transition to exact */
|
|
|
|
|
transition_to_Exact(ctx, bld, block->index);
|
|
|
|
|
assert(instr->operands[0].isTemp());
|
|
|
|
|
cond = instr->operands[0].getTemp();
|
2020-01-10 17:19:40 +01:00
|
|
|
num = ctx.info[block->index].exec.size() - 1;
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-01-10 17:19:40 +01:00
|
|
|
for (int i = num; i >= 0; i--) {
|
2019-09-17 13:22:17 +02:00
|
|
|
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
2019-10-08 13:40:17 +01:00
|
|
|
ctx.info[block->index].exec[i].first, cond);
|
2020-01-10 17:19:40 +01:00
|
|
|
if (i == (int)ctx.info[block->index].exec.size() - 1) {
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
andn2->operands[0] = Operand(exec, bld.lm);
|
|
|
|
|
andn2->definitions[0] = Definition(exec, bld.lm);
|
2019-10-08 13:40:17 +01:00
|
|
|
}
|
2020-01-10 17:19:40 +01:00
|
|
|
|
2021-05-06 15:20:40 +02:00
|
|
|
ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
|
2020-01-10 17:19:40 +01:00
|
|
|
exit_cond = andn2->definitions[1].getTemp();
|
2019-10-08 13:40:17 +01:00
|
|
|
} else {
|
|
|
|
|
assert(i != 0);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
2020-01-10 17:19:40 +01:00
|
|
|
instr->opcode = aco_opcode::p_exit_early_if;
|
|
|
|
|
instr->operands[0] = bld.scc(exit_cond);
|
2019-09-17 13:22:17 +02:00
|
|
|
state = Exact;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bld.insert(std::move(instr));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
add_branch_code(exec_ctx& ctx, Block* block)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
unsigned idx = block->index;
|
|
|
|
|
Builder bld(ctx.program, block);
|
|
|
|
|
|
|
|
|
|
if (idx == ctx.program->blocks.size() - 1)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* try to disable wqm handling */
|
|
|
|
|
if (ctx.handle_wqm && block->kind & block_kind_top_level) {
|
|
|
|
|
if (ctx.info[idx].exec.size() == 3) {
|
|
|
|
|
assert(ctx.info[idx].exec[1].second == mask_type_wqm);
|
|
|
|
|
ctx.info[idx].exec.pop_back();
|
|
|
|
|
}
|
|
|
|
|
assert(ctx.info[idx].exec.size() <= 2);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
if (ctx.info[idx].ever_again_needs == 0 || ctx.info[idx].ever_again_needs == Exact) {
|
2019-09-17 13:22:17 +02:00
|
|
|
/* transition to Exact */
|
|
|
|
|
aco_ptr<Instruction> branch = std::move(block->instructions.back());
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
ctx.info[idx].exec.back().second |= mask_type_global;
|
|
|
|
|
transition_to_Exact(ctx, bld, idx);
|
|
|
|
|
bld.insert(std::move(branch));
|
|
|
|
|
ctx.handle_wqm = false;
|
|
|
|
|
|
|
|
|
|
} else if (ctx.info[idx].block_needs & Preserve_WQM) {
|
|
|
|
|
/* transition to WQM and remove global flag */
|
|
|
|
|
aco_ptr<Instruction> branch = std::move(block->instructions.back());
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
transition_to_WQM(ctx, bld, idx);
|
|
|
|
|
ctx.info[idx].exec.back().second &= ~mask_type_global;
|
|
|
|
|
bld.insert(std::move(branch));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->kind & block_kind_loop_preheader) {
|
|
|
|
|
/* collect information about the succeeding loop */
|
|
|
|
|
bool has_divergent_break = false;
|
|
|
|
|
bool has_divergent_continue = false;
|
|
|
|
|
bool has_discard = false;
|
|
|
|
|
uint8_t needs = 0;
|
|
|
|
|
unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;
|
|
|
|
|
|
|
|
|
|
for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
|
|
|
|
|
Block& loop_block = ctx.program->blocks[i];
|
|
|
|
|
needs |= ctx.info[i].block_needs;
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
if (loop_block.kind & block_kind_uses_discard_if || loop_block.kind & block_kind_discard ||
|
2019-09-26 10:33:43 +01:00
|
|
|
loop_block.kind & block_kind_uses_demote)
|
2019-09-17 13:22:17 +02:00
|
|
|
has_discard = true;
|
|
|
|
|
if (loop_block.loop_nest_depth != loop_nest_depth)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (loop_block.kind & block_kind_uniform)
|
|
|
|
|
continue;
|
|
|
|
|
else if (loop_block.kind & block_kind_break)
|
|
|
|
|
has_divergent_break = true;
|
|
|
|
|
else if (loop_block.kind & block_kind_continue)
|
|
|
|
|
has_divergent_continue = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ctx.handle_wqm) {
|
|
|
|
|
if (needs & WQM) {
|
|
|
|
|
aco_ptr<Instruction> branch = std::move(block->instructions.back());
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
transition_to_WQM(ctx, bld, idx);
|
|
|
|
|
bld.insert(std::move(branch));
|
|
|
|
|
} else {
|
|
|
|
|
aco_ptr<Instruction> branch = std::move(block->instructions.back());
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
transition_to_Exact(ctx, bld, idx);
|
|
|
|
|
bld.insert(std::move(branch));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned num_exec_masks = ctx.info[idx].exec.size();
|
|
|
|
|
if (block->kind & block_kind_top_level)
|
|
|
|
|
num_exec_masks = std::min(num_exec_masks, 2u);
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs,
|
|
|
|
|
has_divergent_break, has_divergent_continue, has_discard);
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2020-08-26 11:10:37 +01:00
|
|
|
/* For normal breaks, this is the exec mask. For discard+break, it's the
|
|
|
|
|
* old exec mask before it was zero'd.
|
|
|
|
|
*/
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
Operand break_cond = Operand(exec, bld.lm);
|
2020-08-26 11:10:37 +01:00
|
|
|
|
2019-09-17 13:22:17 +02:00
|
|
|
if (block->kind & block_kind_discard) {
|
|
|
|
|
|
2021-01-20 15:27:16 +00:00
|
|
|
assert(block->instructions.back()->isBranch());
|
2019-09-17 13:22:17 +02:00
|
|
|
aco_ptr<Instruction> branch = std::move(block->instructions.back());
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
|
|
|
|
|
/* create a discard_if() instruction with the exec mask as condition */
|
|
|
|
|
unsigned num = 0;
|
|
|
|
|
if (ctx.loop.size()) {
|
|
|
|
|
/* if we're in a loop, only discard from the outer exec masks */
|
|
|
|
|
num = ctx.loop.back().num_exec_masks;
|
|
|
|
|
} else {
|
|
|
|
|
num = ctx.info[idx].exec.size() - 1;
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-27 11:04:47 +01:00
|
|
|
Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
2021-07-13 11:22:46 +02:00
|
|
|
Definition(exec, bld.lm), Operand::zero(), Operand(exec, bld.lm));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2019-10-08 13:40:17 +01:00
|
|
|
for (int i = num - 1; i >= 0; i--) {
|
2021-06-09 10:14:54 +02:00
|
|
|
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
get_exec_op(ctx.info[block->index].exec[i].first), cond);
|
2019-11-18 21:00:17 +00:00
|
|
|
if (i == (int)ctx.info[idx].exec.size() - 1)
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
andn2->definitions[0] = Definition(exec, bld.lm);
|
2019-10-08 13:40:17 +01:00
|
|
|
if (i == 0)
|
|
|
|
|
bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
|
2021-05-06 15:20:40 +02:00
|
|
|
ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
|
|
|
|
|
|
2020-08-26 11:10:37 +01:00
|
|
|
break_cond = Operand(cond);
|
2019-09-17 13:22:17 +02:00
|
|
|
bld.insert(std::move(branch));
|
|
|
|
|
/* no return here as it can be followed by a divergent break */
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->kind & block_kind_continue_or_break) {
|
2021-06-09 10:14:54 +02:00
|
|
|
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind &
|
|
|
|
|
block_kind_loop_header);
|
|
|
|
|
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind &
|
|
|
|
|
block_kind_loop_exit);
|
2019-09-17 13:22:17 +02:00
|
|
|
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
|
2020-01-20 17:40:13 +00:00
|
|
|
bool need_parallelcopy = false;
|
|
|
|
|
while (!(ctx.info[idx].exec.back().second & mask_type_loop)) {
|
2019-12-04 13:41:37 +01:00
|
|
|
ctx.info[idx].exec.pop_back();
|
2020-01-20 17:40:13 +00:00
|
|
|
need_parallelcopy = true;
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2020-01-20 17:40:13 +00:00
|
|
|
if (need_parallelcopy)
|
2021-06-09 10:14:54 +02:00
|
|
|
ctx.info[idx].exec.back().first = bld.pseudo(
|
|
|
|
|
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
|
|
|
|
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
|
|
|
|
|
block->linear_succs[1], block->linear_succs[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->kind & block_kind_uniform) {
|
2021-01-21 16:13:34 +00:00
|
|
|
Pseudo_branch_instruction& branch = block->instructions.back()->branch();
|
|
|
|
|
if (branch.opcode == aco_opcode::p_branch) {
|
|
|
|
|
branch.target[0] = block->linear_succs[0];
|
2019-09-17 13:22:17 +02:00
|
|
|
} else {
|
2021-01-21 16:13:34 +00:00
|
|
|
branch.target[0] = block->linear_succs[1];
|
|
|
|
|
branch.target[1] = block->linear_succs[0];
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->kind & block_kind_branch) {
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 &&
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec.back().second == mask_type_exact &&
|
|
|
|
|
!(ctx.info[idx].block_needs & Exact_Branch) &&
|
|
|
|
|
ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
|
|
|
|
|
/* return to wqm before branching */
|
|
|
|
|
ctx.info[idx].exec.pop_back();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// orig = s_and_saveexec_b64
|
|
|
|
|
assert(block->linear_succs.size() == 2);
|
|
|
|
|
assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
|
|
|
|
|
Temp cond = block->instructions.back()->operands[0].getTemp();
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
|
|
|
|
|
if (ctx.info[idx].block_needs & Exact_Branch)
|
|
|
|
|
transition_to_Exact(ctx, bld, idx);
|
|
|
|
|
|
|
|
|
|
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
|
aco: Don't use s_and_saveexec with branches when exec is constant.
When exec is constant, we can remember the constant as the old exec,
and just copy the condition and use it as the new exec. There is no
need to save the constant.
Due to using p_parallelcopy which is lowered to s_mov_b64 (or 32),
many exec restores now become copies, hence the increase in the copy
stats.
Fossil DB changes on Sienna Cichlid:
Totals from 73969 (49.37% of 149839) affected shaders:
SpillSGPRs: 1768 -> 1610 (-8.94%)
CodeSize: 99053892 -> 99047884 (-0.01%); split: -0.02%, +0.01%
Instrs: 19372852 -> 19370398 (-0.01%); split: -0.02%, +0.01%
VClause: 515154 -> 515142 (-0.00%); split: -0.00%, +0.00%
SClause: 719236 -> 718395 (-0.12%); split: -0.14%, +0.02%
Copies: 1109770 -> 1254634 (+13.05%); split: -0.07%, +13.12%
Branches: 374338 -> 374348 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 1776481 -> 1653761 (-6.91%)
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10691>
2021-05-07 15:03:36 +02:00
|
|
|
if (ctx.info[idx].exec.back().first.constantEquals(-1u)) {
|
|
|
|
|
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond);
|
|
|
|
|
} else {
|
|
|
|
|
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
2021-06-09 10:14:54 +02:00
|
|
|
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
aco: Don't use s_and_saveexec with branches when exec is constant.
When exec is constant, we can remember the constant as the old exec,
and just copy the condition and use it as the new exec. There is no
need to save the constant.
Due to using p_parallelcopy which is lowered to s_mov_b64 (or 32),
many exec restores now become copies, hence the increase in the copy
stats.
Fossil DB changes on Sienna Cichlid:
Totals from 73969 (49.37% of 149839) affected shaders:
SpillSGPRs: 1768 -> 1610 (-8.94%)
CodeSize: 99053892 -> 99047884 (-0.01%); split: -0.02%, +0.01%
Instrs: 19372852 -> 19370398 (-0.01%); split: -0.02%, +0.01%
VClause: 515154 -> 515142 (-0.00%); split: -0.00%, +0.00%
SClause: 719236 -> 718395 (-0.12%); split: -0.14%, +0.02%
Copies: 1109770 -> 1254634 (+13.05%); split: -0.07%, +13.12%
Branches: 374338 -> 374348 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 1776481 -> 1653761 (-6.91%)
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10691>
2021-05-07 15:03:36 +02:00
|
|
|
ctx.info[idx].exec.back().first = Operand(old_exec);
|
|
|
|
|
}
|
2019-09-17 13:22:17 +02:00
|
|
|
|
|
|
|
|
/* add next current exec to the stack */
|
2021-05-06 15:20:40 +02:00
|
|
|
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type);
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
|
|
|
|
|
block->linear_succs[1], block->linear_succs[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->kind & block_kind_invert) {
|
|
|
|
|
// exec = s_andn2_b64 (original_exec, exec)
|
2021-05-19 15:42:10 +02:00
|
|
|
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
|
2019-09-17 13:22:17 +02:00
|
|
|
block->instructions.pop_back();
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
assert(ctx.info[idx].exec.size() >= 2);
|
2021-05-06 15:20:40 +02:00
|
|
|
Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
|
2021-06-09 10:14:54 +02:00
|
|
|
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
|
|
|
|
|
Operand(exec, bld.lm));
|
2019-09-17 13:22:17 +02:00
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
|
|
|
|
|
block->linear_succs[1], block->linear_succs[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->kind & block_kind_break) {
|
|
|
|
|
// loop_mask = s_andn2_b64 (loop_mask, exec)
|
|
|
|
|
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
|
|
|
|
|
Temp cond = Temp();
|
|
|
|
|
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
|
|
|
|
|
cond = bld.tmp(s1);
|
2021-05-06 15:20:40 +02:00
|
|
|
Operand exec_mask = ctx.info[idx].exec[exec_idx].first;
|
2019-11-27 11:04:47 +01:00
|
|
|
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
|
2020-08-26 11:10:37 +01:00
|
|
|
exec_mask, break_cond);
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec[exec_idx].first = exec_mask;
|
|
|
|
|
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* check if the successor is the merge block, otherwise set exec to 0 */
|
|
|
|
|
// TODO: this could be done better by directly branching to the merge block
|
|
|
|
|
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
|
|
|
|
|
Block& succ = ctx.program->blocks[succ_idx];
|
|
|
|
|
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
|
|
|
|
|
block->linear_succs[1], block->linear_succs[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->kind & block_kind_continue) {
|
|
|
|
|
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
|
|
|
|
|
block->instructions.pop_back();
|
|
|
|
|
|
|
|
|
|
Temp cond = Temp();
|
|
|
|
|
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
|
|
|
|
|
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
|
|
|
|
|
break;
|
|
|
|
|
cond = bld.tmp(s1);
|
2021-05-06 15:20:40 +02:00
|
|
|
Operand exec_mask = ctx.info[idx].exec[exec_idx].first;
|
2019-11-27 11:04:47 +01:00
|
|
|
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
|
aco: make all exec accesses non-temporaries
So that they are not counted into the register demand.
Totals from 107336 (77.00% of 139391) affected shaders (Navi10):
VGPRs: 4023452 -> 4023248 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 14088 -> 12571 (-10.77%); split: -11.03%, +0.26%
CodeSize: 266816164 -> 266765528 (-0.02%); split: -0.04%, +0.02%
MaxWaves: 1553339 -> 1553374 (+0.00%); split: +0.00%, -0.00%
Instrs: 50977701 -> 50973093 (-0.01%); split: -0.02%, +0.01%
Cycles: 1733911128 -> 1733605320 (-0.02%); split: -0.05%, +0.03%
VMEM: 40867650 -> 40900204 (+0.08%); split: +0.13%, -0.05%
SMEM: 6835980 -> 6829073 (-0.10%); split: +0.10%, -0.20%
VClause: 1032783 -> 1032788 (+0.00%); split: -0.01%, +0.01%
SClause: 2103705 -> 2104115 (+0.02%); split: -0.09%, +0.11%
Copies: 3195658 -> 3193656 (-0.06%); split: -0.30%, +0.24%
Branches: 1140213 -> 1140120 (-0.01%); split: -0.05%, +0.04%
PreSGPRs: 3603785 -> 3437064 (-4.63%); split: -5.13%, +0.50%
PreVGPRs: 3321996 -> 3321850 (-0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8870>
2021-02-03 15:44:49 +01:00
|
|
|
exec_mask, Operand(exec, bld.lm));
|
2019-09-17 13:22:17 +02:00
|
|
|
ctx.info[idx].exec[exec_idx].first = exec_mask;
|
|
|
|
|
}
|
|
|
|
|
assert(cond != Temp());
|
|
|
|
|
|
|
|
|
|
/* check if the successor is the merge block, otherwise set exec to 0 */
|
|
|
|
|
// TODO: this could be done better by directly branching to the merge block
|
|
|
|
|
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
|
|
|
|
|
Block& succ = ctx.program->blocks[succ_idx];
|
|
|
|
|
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
|
2021-07-13 11:22:46 +02:00
|
|
|
bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
|
2019-09-17 13:22:17 +02:00
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
|
|
|
|
|
block->linear_succs[1], block->linear_succs[0]);
|
2019-09-17 13:22:17 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
process_block(exec_ctx& ctx, Block* block)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
std::vector<aco_ptr<Instruction>> instructions;
|
|
|
|
|
instructions.reserve(block->instructions.size());
|
|
|
|
|
|
|
|
|
|
unsigned idx = add_coupling_code(ctx, block, instructions);
|
|
|
|
|
|
|
|
|
|
assert(block->index != ctx.program->blocks.size() - 1 ||
|
|
|
|
|
ctx.info[block->index].exec.size() <= 2);
|
|
|
|
|
|
|
|
|
|
process_instructions(ctx, block, instructions, idx);
|
|
|
|
|
|
|
|
|
|
block->instructions = std::move(instructions);
|
|
|
|
|
|
|
|
|
|
add_branch_code(ctx, block);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} /* end namespace */
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
void
|
|
|
|
|
insert_exec_mask(Program* program)
|
2019-09-17 13:22:17 +02:00
|
|
|
{
|
|
|
|
|
exec_ctx ctx(program);
|
|
|
|
|
|
|
|
|
|
if (program->needs_wqm && program->needs_exact)
|
|
|
|
|
calculate_wqm_needs(ctx);
|
|
|
|
|
|
|
|
|
|
for (Block& block : program->blocks)
|
|
|
|
|
process_block(ctx, &block);
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-09 10:14:54 +02:00
|
|
|
} // namespace aco
|