From f35e229faedb5e19da49846eed119d82228c5d5a Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 19 Jul 2024 20:03:43 +0100 Subject: [PATCH] aco: skip code if exec is empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is safer and potentially faster. fossil-db (navi21): Totals from 690 (0.87% of 79395) affected shaders: Instrs: 4534778 -> 4535916 (+0.03%) CodeSize: 25268516 -> 25272080 (+0.01%); split: -0.00%, +0.01% Latency: 48482721 -> 48513907 (+0.06%); split: -0.00%, +0.07% InvThroughput: 13213965 -> 13217828 (+0.03%); split: -0.00%, +0.03% Copies: 432307 -> 432295 (-0.00%); split: -0.05%, +0.04% Branches: 187305 -> 188249 (+0.50%) VALU: 2904490 -> 2904508 (+0.00%); split: -0.00%, +0.00% SALU: 674962 -> 675133 (+0.03%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/README.md | 1 + .../compiler/aco_instruction_selection.cpp | 162 ++++++++++++++---- src/amd/compiler/aco_instruction_selection.h | 19 ++ src/amd/compiler/tests/test_isel.cpp | 34 ++-- 4 files changed, 165 insertions(+), 51 deletions(-) diff --git a/src/amd/compiler/README.md b/src/amd/compiler/README.md index b833fc18ce3..16904625ef0 100644 --- a/src/amd/compiler/README.md +++ b/src/amd/compiler/README.md @@ -56,6 +56,7 @@ This repairs SSA in the case of mismatches between the logical and linear CFG, w Instruction selection might create mismatches between the logical CFG (the input NIR's CFG) and the linear CFG in the following situations: - We add a break at the end of a loop in case it has no active invocations (an empty exec can prevent any logical breaks from being taken). This creates a linear edge but no logical edge, and SGPR uses outside the loop can require a phi. +- We add an empty exec skip over a block. This is a branch which skips most contents of a sequence of instructions if exec is empty. To avoid critical edges, the inside of the construct logically dominates the merge but not linearly. - An SGPR is defined in one side of a divergent IF but it used in or after the merge block. If the other side of the IF ends in a branch, a phi is not necessary according to the logical CFG, but it is for the linear CFG. However, `sanitize_cf_list()` should already resolve this before translation from NIR for additional reasons. #### Lower Phis diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 0de27a778b0..2b1e8609ffc 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -50,22 +50,6 @@ _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* i free(out); } -struct if_context { - Temp cond; - - bool divergent_old; - bool had_divergent_discard_old; - bool had_divergent_discard_then; - bool has_divergent_continue_old; - bool has_divergent_continue_then; - struct exec_info exec_old; - - unsigned BB_if_idx; - unsigned invert_idx; - Block BB_invert; - Block BB_endif; -}; - struct loop_context { Block loop_exit; @@ -8115,6 +8099,10 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr) emit_split_vector(ctx, dst, instr->def.num_components); } +static void begin_empty_exec_skip(isel_context* ctx, nir_instr* instr, nir_block* block); + +static void end_empty_exec_skip(isel_context* ctx); + void visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) { @@ -8995,11 +8983,13 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) } bld.pseudo(aco_opcode::p_discard_if, cond); - - if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) - ctx->cf_info.exec.potentially_empty_discard = true; - ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx); ctx->block->kind |= block_kind_uses_discard; + + if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) { + ctx->cf_info.exec.potentially_empty_discard = true; + begin_empty_exec_skip(ctx, &instr->instr, instr->instr.block); + } + ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx); ctx->program->needs_exact = true; break; } @@ -10128,7 +10118,6 @@ visit_undef(isel_context* ctx, nir_undef_instr* instr) void begin_loop(isel_context* ctx, loop_context* lc) { - // TODO: we might want to wrap the loop around a branch if exec.potentially_empty=true append_logical_end(ctx->block); ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; Builder bld(ctx->program, ctx->block); @@ -10338,6 +10327,8 @@ emit_loop_continue(isel_context* ctx) void visit_jump(isel_context* ctx, nir_jump_instr* instr) { + end_empty_exec_skip(ctx); + switch (instr->type) { case nir_jump_break: emit_loop_break(ctx); break; case nir_jump_continue: emit_loop_continue(ctx); break; @@ -10380,6 +10371,12 @@ visit_block(isel_context* ctx, nir_block* block) ctx->unended_linear_vgprs.clear(); } + nir_foreach_phi (instr, block) + visit_phi(ctx, instr); + + nir_phi_instr* last_phi = nir_block_last_phi_instr(block); + begin_empty_exec_skip(ctx, last_phi ? &last_phi->instr : NULL, block); + ctx->block->instructions.reserve(ctx->block->instructions.size() + exec_list_length(&block->instr_list) * 2); nir_foreach_instr (instr, block) { @@ -10388,7 +10385,7 @@ visit_block(isel_context* ctx, nir_block* block) case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break; case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break; - case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break; + case nir_instr_type_phi: break; case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break; case nir_instr_type_deref: break; case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break; @@ -10399,8 +10396,8 @@ visit_block(isel_context* ctx, nir_block* block) } static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond); -static void begin_uniform_if_else(isel_context* ctx, if_context* ic); -static void end_uniform_if(isel_context* ctx, if_context* ic); +static void begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else = true); +static void end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else = true); static void visit_loop(isel_context* ctx, nir_loop* loop) @@ -10418,8 +10415,6 @@ static void begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond, nir_selection_control sel_ctrl = nir_selection_control_none) { - ic->cond = cond; - append_logical_end(ctx->block); ctx->block->kind |= block_kind_branch; @@ -10568,7 +10563,9 @@ end_divergent_if(isel_context* ctx, if_context* ic) static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) { - assert(cond.regClass() == s1); + assert(!cond.id() || cond.regClass() == s1); + + ic->cond = cond; append_logical_end(ctx->block); ctx->block->kind |= block_kind_uniform; @@ -10577,8 +10574,13 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) aco_opcode branch_opcode = aco_opcode::p_cbranch_z; branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); - branch->operands[0] = Operand(cond); - branch->operands[0].setPrecolored(scc); + if (cond.id()) { + branch->operands[0] = Operand(cond); + branch->operands[0].setPrecolored(scc); + } else { + branch->operands[0] = Operand(exec, ctx->program->lane_mask); + branch->branch().rarely_taken = true; + } ctx->block->instructions.emplace_back(std::move(branch)); ic->BB_if_idx = ctx->block->index; @@ -10592,7 +10594,8 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) ic->has_divergent_continue_old = ctx->cf_info.parent_loop.has_divergent_continue; /** emit then block */ - ctx->program->next_uniform_if_depth++; + if (ic->cond.id()) + ctx->program->next_uniform_if_depth++; Block* BB_then = ctx->program->create_and_insert_block(); add_edge(ic->BB_if_idx, BB_then); append_logical_start(BB_then); @@ -10600,7 +10603,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) } static void -begin_uniform_if_else(isel_context* ctx, if_context* ic) +begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else) { Block* BB_then = ctx->block; @@ -10628,25 +10631,30 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic) /** emit else block */ Block* BB_else = ctx->program->create_and_insert_block(); - add_edge(ic->BB_if_idx, BB_else); - append_logical_start(BB_else); + if (logical_else) { + add_edge(ic->BB_if_idx, BB_else); + append_logical_start(BB_else); + } else { + add_linear_edge(ic->BB_if_idx, BB_else); + } ctx->block = BB_else; } static void -end_uniform_if(isel_context* ctx, if_context* ic) +end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else) { Block* BB_else = ctx->block; if (!ctx->cf_info.has_branch) { - append_logical_end(BB_else); + if (logical_else) + append_logical_end(BB_else); /* branch from then block to endif block */ aco_ptr branch; branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); BB_else->instructions.emplace_back(std::move(branch)); add_linear_edge(BB_else->index, &ic->BB_endif); - if (!ctx->cf_info.parent_loop.has_divergent_branch) + if (logical_else && !ctx->cf_info.parent_loop.has_divergent_branch) add_logical_edge(BB_else->index, &ic->BB_endif); BB_else->kind |= block_kind_uniform; } @@ -10657,7 +10665,8 @@ end_uniform_if(isel_context* ctx, if_context* ic) ctx->cf_info.parent_loop.has_divergent_continue |= ic->has_divergent_continue_then; /** emit endif merge block */ - ctx->program->next_uniform_if_depth--; + if (ic->cond.id()) + ctx->program->next_uniform_if_depth--; ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); append_logical_start(ctx->block); @@ -10665,6 +10674,74 @@ end_uniform_if(isel_context* ctx, if_context* ic) assert(!ctx->block->logical_preds.empty()); } +static void +end_empty_exec_skip(isel_context* ctx) +{ + if (ctx->cf_info.skipping_empty_exec) { + begin_uniform_if_else(ctx, &ctx->cf_info.empty_exec_skip, false); + end_uniform_if(ctx, &ctx->cf_info.empty_exec_skip, false); + ctx->cf_info.skipping_empty_exec = false; + + ctx->cf_info.exec.combine(ctx->cf_info.empty_exec_skip.exec_old); + } +} + +/* + * If necessary, begin a branch which skips over instructions if exec is empty. + * + * The linear CFG: + * BB_IF + * / \ + * BB_THEN (logical) BB_ELSE (linear) + * \ / + * BB_ENDIF + * + * The logical CFG: + * BB_IF + * | + * BB_THEN (logical) + * | + * BB_ENDIF + * + * BB_THEN should not end with a branch, since that would make BB_ENDIF unreachable. + */ +static void +begin_empty_exec_skip(isel_context* ctx, nir_instr* after_instr, nir_block* block) +{ + if (!ctx->cf_info.exec.potentially_empty_discard && !ctx->cf_info.exec.potentially_empty_break && + !ctx->cf_info.exec.potentially_empty_continue) + return; + + assert(!(ctx->block->kind & block_kind_top_level)); + + bool further_cf_empty = !nir_cf_node_next(&block->cf_node); + + bool rest_of_block_empty = false; + if (after_instr) { + rest_of_block_empty = + nir_instr_is_last(after_instr) || nir_instr_next(after_instr)->type == nir_instr_type_jump; + } else { + rest_of_block_empty = exec_list_is_empty(&block->instr_list) || + nir_block_first_instr(block)->type == nir_instr_type_jump; + } + + assert(!(ctx->block->kind & block_kind_export_end) || rest_of_block_empty); + + if (rest_of_block_empty && further_cf_empty) + return; + + /* Don't nest these skipping branches. It is not worth the complexity. */ + end_empty_exec_skip(ctx); + + begin_uniform_if_then(ctx, &ctx->cf_info.empty_exec_skip, Temp()); + ctx->cf_info.skipping_empty_exec = true; + + ctx->cf_info.empty_exec_skip.exec_old = ctx->cf_info.exec; + ctx->cf_info.exec = exec_info(); + + ctx->program->should_repair_ssa = true; +} + static void visit_if(isel_context* ctx, nir_if* if_stmt) { @@ -10739,6 +10816,13 @@ visit_if(isel_context* ctx, nir_if* if_stmt) static void visit_cf_list(isel_context* ctx, struct exec_list* list) { + if (nir_cf_list_is_empty_block(list)) + return; + + bool skipping_empty_exec_old = ctx->cf_info.skipping_empty_exec; + if_context empty_exec_skip_old = std::move(ctx->cf_info.empty_exec_skip); + ctx->cf_info.skipping_empty_exec = false; + foreach_list_typed (nir_cf_node, node, node, list) { switch (node->type) { case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break; @@ -10747,6 +10831,10 @@ visit_cf_list(isel_context* ctx, struct exec_list* list) default: unreachable("unimplemented cf list type"); } } + + end_empty_exec_skip(ctx); + ctx->cf_info.skipping_empty_exec = skipping_empty_exec_old; + ctx->cf_info.empty_exec_skip = std::move(empty_exec_skip_old); } static void diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h index d7464811def..3a0bff49678 100644 --- a/src/amd/compiler/aco_instruction_selection.h +++ b/src/amd/compiler/aco_instruction_selection.h @@ -60,6 +60,22 @@ struct exec_info { } }; +struct if_context { + Temp cond; + + bool divergent_old; + bool had_divergent_discard_old; + bool had_divergent_discard_then; + bool has_divergent_continue_old; + bool has_divergent_continue_then; + struct exec_info exec_old; + + unsigned BB_if_idx; + unsigned invert_idx; + Block BB_invert; + Block BB_endif; +}; + struct isel_context { const struct aco_compiler_options* options; const struct ac_shader_args* args; @@ -85,6 +101,9 @@ struct isel_context { bool had_divergent_discard = false; struct exec_info exec; + + bool skipping_empty_exec = false; + if_context empty_exec_skip; } cf_info; /* NIR range analysis. */ diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp index de0c4582a4c..a1a6c528d04 100644 --- a/src/amd/compiler/tests/test_isel.cpp +++ b/src/amd/compiler/tests/test_isel.cpp @@ -772,7 +772,10 @@ BEGIN_TEST(isel.cf.uniform_if_branch_use) { /* The contents of this branch is moved to the merge block. */ //>> BB14 - //! /* logical preds: BB13, / linear preds: BB12, BB13, / kind: uniform, continue, */ + //! /* logical preds: BB13, / linear preds: BB12, BB13, / kind: uniform, */ + //>> s2: %_ = p_cbranch_z %0:exec rarely_taken + //! BB15 + //! /* logical preds: BB14, / linear preds: BB14, / kind: uniform, */ //! p_logical_start //! s1: %val = p_unit_test 0 val = nir_unit_test_uniform_amd(nb, 1, 32, .base=0); @@ -781,6 +784,9 @@ BEGIN_TEST(isel.cf.uniform_if_branch_use) //! p_unit_test 1, %val nir_unit_test_amd(nb, val, .base=1); + + //>> BB17 + //! /* logical preds: BB15, / linear preds: BB15, BB16, / kind: uniform, continue, */ } nir_pop_loop(nb, NULL); @@ -944,8 +950,8 @@ END_TEST /** * loop { * use(phi(, a)) - * terminate_if * loop { + * terminate_if * if (uniform) { * a = loop_invariant_sgpr * break @@ -967,7 +973,7 @@ BEGIN_TEST(isel.cf.hidden_break_no_lcssa_header_phi) nir_loop *loop = nir_push_loop(nb); { //>> BB1 - //! /* logical preds: BB0, BB11, / linear preds: BB0, BB13, / kind: uniform, loop-preheader, loop-header, discard, */ + //! /* logical preds: BB0, BB17, / linear preds: BB0, BB19, / kind: uniform, loop-preheader, loop-header, */ //! s1: %phi = p_linear_phi %init, %val_lcssa //! p_logical_start //! p_unit_test 1, %phi @@ -976,31 +982,31 @@ BEGIN_TEST(isel.cf.hidden_break_no_lcssa_header_phi) nir_phi_instr_add_src(phi, init->parent_instr->block, init); nir_unit_test_amd(nb, &phi->def, .base = 1); - nir_terminate_if(nb, nir_unit_test_divergent_amd(nb, 1, 1, .base = 2)); - //>> BB2 - //! /* logical preds: BB1, BB5, / linear preds: BB1, BB7, / kind: uniform, loop-header, */ + //! /* logical preds: BB1, BB8, / linear preds: BB1, BB10, / kind: uniform, loop-header, discard, */ nir_push_loop(nb); { + nir_terminate_if(nb, nir_unit_test_divergent_amd(nb, 1, 1, .base = 2)); + nir_push_if(nb, nir_unit_test_uniform_amd(nb, 1, 1, .base = 3)); { - //>> BB3 - //! /* logical preds: BB2, / linear preds: BB2, / kind: uniform, break, */ + //>> BB4 + //! /* logical preds: BB3, / linear preds: BB3, / kind: uniform, break, */ //! p_logical_start //! s1: %val = p_parallelcopy 0 val = nir_imm_zero(nb, 1, 32); nir_jump(nb, nir_jump_break); } nir_pop_if(nb, NULL); - //>> BB5 - //! /* logical preds: BB4, / linear preds: BB4, / kind: uniform, continue_or_break, */ + //>> BB8 + //! /* logical preds: BB6, / linear preds: BB6, BB7, / kind: uniform, continue_or_break, */ } nir_pop_loop(nb, NULL); - //>> BB8 - //! /* logical preds: BB3, / linear preds: BB3, BB6, / kind: uniform, loop-exit, */ - //! s1: %val_lcssa = p_linear_phi %val, s1: undef //>> BB11 - //! /* logical preds: BB10, / linear preds: BB10, / kind: uniform, continue_or_break, */ + //! /* logical preds: BB4, / linear preds: BB4, BB9, / kind: uniform, loop-exit, */ + //! s1: %val_lcssa = p_linear_phi %val, s1: undef + //>> BB17 + //! /* logical preds: BB15, / linear preds: BB15, BB16, / kind: uniform, continue_or_break, */ nir_phi_instr_add_src(phi, nir_cursor_current_block(nb->cursor), val); }