aco: optimize conditional divergent breaks at the end of loops

Removes one branch and one s_mov.

Foz-DB Navi21:
Totals from 1483 (1.87% of 79395) affected shaders:
Instrs: 6424114 -> 6373084 (-0.79%)
CodeSize: 35309320 -> 35091084 (-0.62%); split: -0.63%, +0.01%
Latency: 87950935 -> 88030841 (+0.09%); split: -0.03%, +0.12%
InvThroughput: 24784756 -> 24799536 (+0.06%); split: -0.02%, +0.08%
Copies: 588743 -> 561805 (-4.58%)
Branches: 242521 -> 215578 (-11.11%)
SALU: 877856 -> 850918 (-3.07%)

Signed-off-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19070>
2026-05-05 11:48:06 +02:00 · 2022-10-13 13:56:42 +02:00 · 2022-10-13 13:56:42 +02:00 · 6c73a8a7f2
commit 6c73a8a7f2
parent 075c5818cb
1 changed files with 134 additions and 0 deletions
--- a/src/amd/compiler/aco_ssa_elimination.cpp
+++ b/src/amd/compiler/aco_ssa_elimination.cpp
@ -276,6 +276,12 @@ try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
   block->linear_succs.clear();
 }

+/* Returns true when instr is a p_parallelcopy that has exactly one definition. */
+bool
+is_simple_copy(Instruction* instr)
+{
+   if (instr->opcode != aco_opcode::p_parallelcopy)
+      return false;
+
+   return instr->definitions.size() == 1;
+}
+
 bool
 instr_writes_exec(Instruction* instr)
 {
@ -319,6 +325,131 @@ instr_accesses(Instruction* instr, const T& a, bool ignore_reads)
   return false;
 }

+/* Folds a conditional break at the end of a loop body into the following
+ * merge block, so that the exec mask update and the loop-exit branch both
+ * happen in the merge block.
+ *
+ * Pattern that is matched (branch operands are "target[0], target[1]"):
+ * BB1:
+ *    ...
+ *    p_cbranch_z exec BB3, BB2
+ * BB2:                                  <- block
+ *    ...
+ *    s[0:1], scc = s_andn2 s[0:1], exec
+ *    p_cbranch_z scc BB4, BB3
+ * BB3:                                  <- merge block
+ *    exec = p_parallelcopy s[0:1]
+ *    p_branch BB1
+ * BB4:                                  <- loop exit
+ *    ...
+ *
+ * Result on GFX9 and newer:
+ * BB1:
+ *    ...
+ *    p_cbranch_z exec BB3, BB2
+ * BB2:
+ *    ...
+ *    p_branch BB3
+ * BB3:
+ *    s[0:1], scc, exec = s_andn2_wrexec s[0:1], exec
+ *    p_cbranch_nz scc BB1, BB4
+ * BB4:
+ *    ...
+ *
+ * Before GFX9 the s_andn2 and the copy to exec are left as two separate
+ * instructions at the start of BB3.
+ *
+ * The function returns without touching anything if any part of the pattern
+ * does not match.
+ */
+void
+try_merge_break_with_continue(ssa_elimination_ctx& ctx, Block* block)
+{
+   /* A two-way branch plus at least one instruction in front of it is required. */
+   if (block->linear_succs.size() != 2)
+      return;
+   if (block->instructions.size() < 2)
+      return;
+
+   Pseudo_branch_instruction* break_branch = &block->instructions.back()->branch();
+   if (break_branch->operands[0].physReg() != scc ||
+       break_branch->opcode != aco_opcode::p_cbranch_z)
+      return;
+
+   Block* exit_block = &ctx.program->blocks[break_branch->target[0]];
+   Block* merge_block = &ctx.program->blocks[break_branch->target[1]];
+
+   /* The merge block must have exactly one successor (the jump back to the loop header). */
+   if (merge_block->linear_succs.size() != 1)
+      return;
+
+   /* The loop exit becomes the fallthrough of the merge block, so every block
+    * placed in between the two has to be free of instructions.
+    */
+   unsigned between = merge_block->index + 1;
+   while (between < exit_block->index) {
+      if (!ctx.program->blocks[between].instructions.empty())
+         return;
+      between++;
+   }
+
+   /* Every other predecessor of the merge block may only get there through a
+    * branch taken on exec being zero, otherwise exec would be corrupted.
+    */
+   for (unsigned pred_idx : merge_block->linear_preds) {
+      Block* pred = &ctx.program->blocks[pred_idx];
+      if (pred == block)
+         continue;
+
+      Instruction* pred_tail = pred->instructions.back().get();
+      const bool jumps_on_exec_zero = pred_tail->isBranch() &&
+                                      pred_tail->opcode == aco_opcode::p_cbranch_z &&
+                                      pred_tail->operands[0].physReg() == exec;
+      if (!jumps_on_exec_zero)
+         return;
+   }
+
+   /* Expected merge block contents: copy to exec, logical_start, logical_end, branch. */
+   if (merge_block->instructions.size() != 4)
+      return;
+   if (!ctx.logical_phi_info[merge_block->index].empty())
+      return;
+   if (!ctx.linear_phi_info[merge_block->index].empty())
+      return;
+   if (!is_empty_block(merge_block, true))
+      return;
+
+   Instruction* exec_copy = merge_block->instructions[0].get();
+   if (!is_simple_copy(exec_copy) || exec_copy->definitions[0].physReg() != exec)
+      return;
+
+   /* Choose the 32-bit or 64-bit opcodes depending on the lane mask size. */
+   aco_opcode andn2 = aco_opcode::s_andn2_b32;
+   aco_opcode andn2_wrexec = aco_opcode::s_andn2_wrexec_b32;
+   if (ctx.program->lane_mask == s2) {
+      andn2 = aco_opcode::s_andn2_b64;
+      andn2_wrexec = aco_opcode::s_andn2_wrexec_b64;
+   }
+
+   /* The instruction right before the branch has to be
+    * "mask = s_andn2 mask, exec", where mask is the register that the merge
+    * block copies to exec.
+    */
+   auto mask_update_it = block->instructions.end() - 2;
+   Instruction* mask_update = mask_update_it->get();
+   if (mask_update->opcode != andn2)
+      return;
+
+   const auto mask_reg = exec_copy->operands[0].physReg();
+   if (mask_update->definitions[0].physReg() != mask_reg ||
+       mask_update->operands[0].physReg() != mask_reg ||
+       mask_update->operands[1].physReg() != exec)
+      return;
+
+   assert(ctx.linear_phi_info[block->index].empty());
+
+   /* Relocate the s_andn2 to the very start of the merge block. */
+   merge_block->instructions.insert(merge_block->instructions.begin(),
+                                    std::move(*mask_update_it));
+   block->instructions.erase(mask_update_it);
+
+   /* The conditional branch will end up in the merge block: continue with
+    * the merge block's old successor when scc is set, leave the loop otherwise.
+    */
+   break_branch->opcode = aco_opcode::p_cbranch_nz;
+   break_branch->target[0] = merge_block->linear_succs[0];
+   break_branch->target[1] = exit_block->index;
+
+   /* Point the merge block's branch at the merge block itself, then exchange
+    * the two branch instructions. The second swap moves the first definition
+    * of each branch back, so it stays with the block it was in before.
+    */
+   aco_ptr<Instruction>& merge_tail = merge_block->instructions.back();
+   aco_ptr<Instruction>& block_tail = block->instructions.back();
+   merge_tail->branch().target[0] = merge_block->index;
+   std::swap(merge_tail, block_tail);
+   std::swap(merge_tail->definitions[0], block_tail->definitions[0]);
+
+   /* Update the linear CFG: block -> merge only; merge -> loop exit, old successor. */
+   block->linear_succs.clear();
+   block->linear_succs.push_back(merge_block->index);
+   merge_block->linear_succs.push_back(exit_block->index);
+   std::swap(merge_block->linear_succs[0], merge_block->linear_succs[1]);
+   /* The relocated s_andn2 reads exec at the start of the merge block. */
+   ctx.blocks_incoming_exec_used[merge_block->index] = true;
+
+   std::replace(exit_block->linear_preds.begin(), exit_block->linear_preds.end(), block->index,
+                merge_block->index);
+
+   if (ctx.program->gfx_level >= GFX9) {
+      /* Fuse the s_andn2 and the copy to exec into a single s_andn2_wrexec. */
+      Instruction* split = merge_block->instructions[0].get();
+      Instruction* fused = create_instruction(andn2_wrexec, Format::SOP1, 2, 3);
+      for (unsigned i = 0; i < 2; i++) {
+         fused->operands[i] = split->operands[i];
+         fused->definitions[i] = split->definitions[i];
+      }
+      fused->definitions[2] = Definition(exec, ctx.program->lane_mask);
+
+      /* Drop the s_andn2 and let the fused instruction take the copy's slot. */
+      merge_block->instructions.erase(merge_block->instructions.begin());
+      merge_block->instructions[0].reset(fused);
+   }
+}
+
 void
 try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const int exec_val_idx,
                                const int exec_copy_idx)
@ -647,6 +778,9 @@ jump_threading(ssa_elimination_ctx& ctx)
      Block* block = &ctx.program->blocks[i];
      eliminate_useless_exec_writes_in_block(ctx, *block);

+      if (block->kind & block_kind_break)
+         try_merge_break_with_continue(ctx, block);
+
      if (!ctx.empty_blocks[i])
         continue;