aco: skip code if exec is empty

This is safer and potentially faster.

fossil-db (navi21):
Totals from 690 (0.87% of 79395) affected shaders:
Instrs: 4534778 -> 4535916 (+0.03%)
CodeSize: 25268516 -> 25272080 (+0.01%); split: -0.00%, +0.01%
Latency: 48482721 -> 48513907 (+0.06%); split: -0.00%, +0.07%
InvThroughput: 13213965 -> 13217828 (+0.03%); split: -0.00%, +0.03%
Copies: 432307 -> 432295 (-0.00%); split: -0.05%, +0.04%
Branches: 187305 -> 188249 (+0.50%)
VALU: 2904490 -> 2904508 (+0.00%); split: -0.00%, +0.00%
SALU: 674962 -> 675133 (+0.03%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31143>
This commit is contained in:
Rhys Perry 2024-07-19 20:03:43 +01:00 committed by Marge Bot
parent f00c3a14c0
commit f35e229fae
4 changed files with 165 additions and 51 deletions

View file

@ -56,6 +56,7 @@ This repairs SSA in the case of mismatches between the logical and linear CFG, w
Instruction selection might create mismatches between the logical CFG (the input NIR's CFG) and the linear CFG in the following situations:
- We add a break at the end of a loop in case it has no active invocations (an empty exec can prevent any logical breaks from being taken). This creates a linear edge but no logical edge, and SGPR uses outside the loop can require a phi.
- We add an empty exec skip over a block. This is a branch which skips most contents of a sequence of instructions if exec is empty. To avoid critical edges, the inside of the construct logically dominates the merge but not linearly.
- An SGPR is defined in one side of a divergent IF but it used in or after the merge block. If the other side of the IF ends in a branch, a phi is not necessary according to the logical CFG, but it is for the linear CFG. However, `sanitize_cf_list()` should already resolve this before translation from NIR for additional reasons.
#### Lower Phis

View file

@ -50,22 +50,6 @@ _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* i
free(out);
}
struct if_context {
Temp cond;
bool divergent_old;
bool had_divergent_discard_old;
bool had_divergent_discard_then;
bool has_divergent_continue_old;
bool has_divergent_continue_then;
struct exec_info exec_old;
unsigned BB_if_idx;
unsigned invert_idx;
Block BB_invert;
Block BB_endif;
};
struct loop_context {
Block loop_exit;
@ -8115,6 +8099,10 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
emit_split_vector(ctx, dst, instr->def.num_components);
}
static void begin_empty_exec_skip(isel_context* ctx, nir_instr* instr, nir_block* block);
static void end_empty_exec_skip(isel_context* ctx);
void
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
{
@ -8995,11 +8983,13 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
}
bld.pseudo(aco_opcode::p_discard_if, cond);
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
ctx->cf_info.exec.potentially_empty_discard = true;
ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
ctx->block->kind |= block_kind_uses_discard;
if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) {
ctx->cf_info.exec.potentially_empty_discard = true;
begin_empty_exec_skip(ctx, &instr->instr, instr->instr.block);
}
ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
ctx->program->needs_exact = true;
break;
}
@ -10128,7 +10118,6 @@ visit_undef(isel_context* ctx, nir_undef_instr* instr)
void
begin_loop(isel_context* ctx, loop_context* lc)
{
// TODO: we might want to wrap the loop around a branch if exec.potentially_empty=true
append_logical_end(ctx->block);
ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
Builder bld(ctx->program, ctx->block);
@ -10338,6 +10327,8 @@ emit_loop_continue(isel_context* ctx)
void
visit_jump(isel_context* ctx, nir_jump_instr* instr)
{
end_empty_exec_skip(ctx);
switch (instr->type) {
case nir_jump_break: emit_loop_break(ctx); break;
case nir_jump_continue: emit_loop_continue(ctx); break;
@ -10380,6 +10371,12 @@ visit_block(isel_context* ctx, nir_block* block)
ctx->unended_linear_vgprs.clear();
}
nir_foreach_phi (instr, block)
visit_phi(ctx, instr);
nir_phi_instr* last_phi = nir_block_last_phi_instr(block);
begin_empty_exec_skip(ctx, last_phi ? &last_phi->instr : NULL, block);
ctx->block->instructions.reserve(ctx->block->instructions.size() +
exec_list_length(&block->instr_list) * 2);
nir_foreach_instr (instr, block) {
@ -10388,7 +10385,7 @@ visit_block(isel_context* ctx, nir_block* block)
case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
case nir_instr_type_phi: break;
case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
case nir_instr_type_deref: break;
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
@ -10399,8 +10396,8 @@ visit_block(isel_context* ctx, nir_block* block)
}
static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
static void end_uniform_if(isel_context* ctx, if_context* ic);
static void begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else = true);
static void end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else = true);
static void
visit_loop(isel_context* ctx, nir_loop* loop)
@ -10418,8 +10415,6 @@ static void
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
nir_selection_control sel_ctrl = nir_selection_control_none)
{
ic->cond = cond;
append_logical_end(ctx->block);
ctx->block->kind |= block_kind_branch;
@ -10568,7 +10563,9 @@ end_divergent_if(isel_context* ctx, if_context* ic)
static void
begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
{
assert(cond.regClass() == s1);
assert(!cond.id() || cond.regClass() == s1);
ic->cond = cond;
append_logical_end(ctx->block);
ctx->block->kind |= block_kind_uniform;
@ -10577,8 +10574,13 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
branch->operands[0] = Operand(cond);
branch->operands[0].setPrecolored(scc);
if (cond.id()) {
branch->operands[0] = Operand(cond);
branch->operands[0].setPrecolored(scc);
} else {
branch->operands[0] = Operand(exec, ctx->program->lane_mask);
branch->branch().rarely_taken = true;
}
ctx->block->instructions.emplace_back(std::move(branch));
ic->BB_if_idx = ctx->block->index;
@ -10592,7 +10594,8 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
ic->has_divergent_continue_old = ctx->cf_info.parent_loop.has_divergent_continue;
/** emit then block */
ctx->program->next_uniform_if_depth++;
if (ic->cond.id())
ctx->program->next_uniform_if_depth++;
Block* BB_then = ctx->program->create_and_insert_block();
add_edge(ic->BB_if_idx, BB_then);
append_logical_start(BB_then);
@ -10600,7 +10603,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
}
static void
begin_uniform_if_else(isel_context* ctx, if_context* ic)
begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else)
{
Block* BB_then = ctx->block;
@ -10628,25 +10631,30 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic)
/** emit else block */
Block* BB_else = ctx->program->create_and_insert_block();
add_edge(ic->BB_if_idx, BB_else);
append_logical_start(BB_else);
if (logical_else) {
add_edge(ic->BB_if_idx, BB_else);
append_logical_start(BB_else);
} else {
add_linear_edge(ic->BB_if_idx, BB_else);
}
ctx->block = BB_else;
}
static void
end_uniform_if(isel_context* ctx, if_context* ic)
end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else)
{
Block* BB_else = ctx->block;
if (!ctx->cf_info.has_branch) {
append_logical_end(BB_else);
if (logical_else)
append_logical_end(BB_else);
/* branch from then block to endif block */
aco_ptr<Instruction> branch;
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
BB_else->instructions.emplace_back(std::move(branch));
add_linear_edge(BB_else->index, &ic->BB_endif);
if (!ctx->cf_info.parent_loop.has_divergent_branch)
if (logical_else && !ctx->cf_info.parent_loop.has_divergent_branch)
add_logical_edge(BB_else->index, &ic->BB_endif);
BB_else->kind |= block_kind_uniform;
}
@ -10657,7 +10665,8 @@ end_uniform_if(isel_context* ctx, if_context* ic)
ctx->cf_info.parent_loop.has_divergent_continue |= ic->has_divergent_continue_then;
/** emit endif merge block */
ctx->program->next_uniform_if_depth--;
if (ic->cond.id())
ctx->program->next_uniform_if_depth--;
ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
append_logical_start(ctx->block);
@ -10665,6 +10674,74 @@ end_uniform_if(isel_context* ctx, if_context* ic)
assert(!ctx->block->logical_preds.empty());
}
static void
end_empty_exec_skip(isel_context* ctx)
{
if (ctx->cf_info.skipping_empty_exec) {
begin_uniform_if_else(ctx, &ctx->cf_info.empty_exec_skip, false);
end_uniform_if(ctx, &ctx->cf_info.empty_exec_skip, false);
ctx->cf_info.skipping_empty_exec = false;
ctx->cf_info.exec.combine(ctx->cf_info.empty_exec_skip.exec_old);
}
}
/*
* If necessary, begin a branch which skips over instructions if exec is empty.
*
* The linear CFG:
* BB_IF
* / \
* BB_THEN (logical) BB_ELSE (linear)
* \ /
* BB_ENDIF
*
* The logical CFG:
* BB_IF
* |
* BB_THEN (logical)
* |
* BB_ENDIF
*
* BB_THEN should not end with a branch, since that would make BB_ENDIF unreachable.
*/
static void
begin_empty_exec_skip(isel_context* ctx, nir_instr* after_instr, nir_block* block)
{
if (!ctx->cf_info.exec.potentially_empty_discard && !ctx->cf_info.exec.potentially_empty_break &&
!ctx->cf_info.exec.potentially_empty_continue)
return;
assert(!(ctx->block->kind & block_kind_top_level));
bool further_cf_empty = !nir_cf_node_next(&block->cf_node);
bool rest_of_block_empty = false;
if (after_instr) {
rest_of_block_empty =
nir_instr_is_last(after_instr) || nir_instr_next(after_instr)->type == nir_instr_type_jump;
} else {
rest_of_block_empty = exec_list_is_empty(&block->instr_list) ||
nir_block_first_instr(block)->type == nir_instr_type_jump;
}
assert(!(ctx->block->kind & block_kind_export_end) || rest_of_block_empty);
if (rest_of_block_empty && further_cf_empty)
return;
/* Don't nest these skipping branches. It is not worth the complexity. */
end_empty_exec_skip(ctx);
begin_uniform_if_then(ctx, &ctx->cf_info.empty_exec_skip, Temp());
ctx->cf_info.skipping_empty_exec = true;
ctx->cf_info.empty_exec_skip.exec_old = ctx->cf_info.exec;
ctx->cf_info.exec = exec_info();
ctx->program->should_repair_ssa = true;
}
static void
visit_if(isel_context* ctx, nir_if* if_stmt)
{
@ -10739,6 +10816,13 @@ visit_if(isel_context* ctx, nir_if* if_stmt)
static void
visit_cf_list(isel_context* ctx, struct exec_list* list)
{
if (nir_cf_list_is_empty_block(list))
return;
bool skipping_empty_exec_old = ctx->cf_info.skipping_empty_exec;
if_context empty_exec_skip_old = std::move(ctx->cf_info.empty_exec_skip);
ctx->cf_info.skipping_empty_exec = false;
foreach_list_typed (nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
@ -10747,6 +10831,10 @@ visit_cf_list(isel_context* ctx, struct exec_list* list)
default: unreachable("unimplemented cf list type");
}
}
end_empty_exec_skip(ctx);
ctx->cf_info.skipping_empty_exec = skipping_empty_exec_old;
ctx->cf_info.empty_exec_skip = std::move(empty_exec_skip_old);
}
static void

View file

@ -60,6 +60,22 @@ struct exec_info {
}
};
struct if_context {
Temp cond;
bool divergent_old;
bool had_divergent_discard_old;
bool had_divergent_discard_then;
bool has_divergent_continue_old;
bool has_divergent_continue_then;
struct exec_info exec_old;
unsigned BB_if_idx;
unsigned invert_idx;
Block BB_invert;
Block BB_endif;
};
struct isel_context {
const struct aco_compiler_options* options;
const struct ac_shader_args* args;
@ -85,6 +101,9 @@ struct isel_context {
bool had_divergent_discard = false;
struct exec_info exec;
bool skipping_empty_exec = false;
if_context empty_exec_skip;
} cf_info;
/* NIR range analysis. */

View file

@ -772,7 +772,10 @@ BEGIN_TEST(isel.cf.uniform_if_branch_use)
{
/* The contents of this branch is moved to the merge block. */
//>> BB14
//! /* logical preds: BB13, / linear preds: BB12, BB13, / kind: uniform, continue, */
//! /* logical preds: BB13, / linear preds: BB12, BB13, / kind: uniform, */
//>> s2: %_ = p_cbranch_z %0:exec rarely_taken
//! BB15
//! /* logical preds: BB14, / linear preds: BB14, / kind: uniform, */
//! p_logical_start
//! s1: %val = p_unit_test 0
val = nir_unit_test_uniform_amd(nb, 1, 32, .base=0);
@ -781,6 +784,9 @@ BEGIN_TEST(isel.cf.uniform_if_branch_use)
//! p_unit_test 1, %val
nir_unit_test_amd(nb, val, .base=1);
//>> BB17
//! /* logical preds: BB15, / linear preds: BB15, BB16, / kind: uniform, continue, */
}
nir_pop_loop(nb, NULL);
@ -944,8 +950,8 @@ END_TEST
/**
* loop {
* use(phi(, a))
* terminate_if
* loop {
* terminate_if
* if (uniform) {
* a = loop_invariant_sgpr
* break
@ -967,7 +973,7 @@ BEGIN_TEST(isel.cf.hidden_break_no_lcssa_header_phi)
nir_loop *loop = nir_push_loop(nb);
{
//>> BB1
//! /* logical preds: BB0, BB11, / linear preds: BB0, BB13, / kind: uniform, loop-preheader, loop-header, discard, */
//! /* logical preds: BB0, BB17, / linear preds: BB0, BB19, / kind: uniform, loop-preheader, loop-header, */
//! s1: %phi = p_linear_phi %init, %val_lcssa
//! p_logical_start
//! p_unit_test 1, %phi
@ -976,31 +982,31 @@ BEGIN_TEST(isel.cf.hidden_break_no_lcssa_header_phi)
nir_phi_instr_add_src(phi, init->parent_instr->block, init);
nir_unit_test_amd(nb, &phi->def, .base = 1);
nir_terminate_if(nb, nir_unit_test_divergent_amd(nb, 1, 1, .base = 2));
//>> BB2
//! /* logical preds: BB1, BB5, / linear preds: BB1, BB7, / kind: uniform, loop-header, */
//! /* logical preds: BB1, BB8, / linear preds: BB1, BB10, / kind: uniform, loop-header, discard, */
nir_push_loop(nb);
{
nir_terminate_if(nb, nir_unit_test_divergent_amd(nb, 1, 1, .base = 2));
nir_push_if(nb, nir_unit_test_uniform_amd(nb, 1, 1, .base = 3));
{
//>> BB3
//! /* logical preds: BB2, / linear preds: BB2, / kind: uniform, break, */
//>> BB4
//! /* logical preds: BB3, / linear preds: BB3, / kind: uniform, break, */
//! p_logical_start
//! s1: %val = p_parallelcopy 0
val = nir_imm_zero(nb, 1, 32);
nir_jump(nb, nir_jump_break);
}
nir_pop_if(nb, NULL);
//>> BB5
//! /* logical preds: BB4, / linear preds: BB4, / kind: uniform, continue_or_break, */
//>> BB8
//! /* logical preds: BB6, / linear preds: BB6, BB7, / kind: uniform, continue_or_break, */
}
nir_pop_loop(nb, NULL);
//>> BB8
//! /* logical preds: BB3, / linear preds: BB3, BB6, / kind: uniform, loop-exit, */
//! s1: %val_lcssa = p_linear_phi %val, s1: undef
//>> BB11
//! /* logical preds: BB10, / linear preds: BB10, / kind: uniform, continue_or_break, */
//! /* logical preds: BB4, / linear preds: BB4, BB9, / kind: uniform, loop-exit, */
//! s1: %val_lcssa = p_linear_phi %val, s1: undef
//>> BB17
//! /* logical preds: BB15, / linear preds: BB15, BB16, / kind: uniform, continue_or_break, */
nir_phi_instr_add_src(phi, nir_cursor_current_block(nb->cursor), val);
}