aco: refactor and speed-up dead code analysis

Assuming that no loop header phis are dead code, we can perform the dead code
analysis in a single iteration.

Totals from 25 (0.03% of 79330) affected shaders: (GFX11)
MaxWaves: 664 -> 662 (-0.30%)
Instrs: 487618 -> 488822 (+0.25%)
CodeSize: 2451548 -> 2459756 (+0.33%)
VGPRs: 1296 -> 1332 (+2.78%)
Latency: 2337256 -> 2338098 (+0.04%); split: -0.00%, +0.04%
InvThroughput: 560682 -> 576158 (+2.76%)
VClause: 15782 -> 15790 (+0.05%)
Copies: 37905 -> 38731 (+2.18%)
PreVGPRs: 1124 -> 1156 (+2.85%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26901>
2026-05-07 09:18:04 +02:00 · 2024-01-05 08:22:36 +01:00 · 2024-01-05 08:22:36 +01:00 · dce695b24f
commit dce695b24f
parent a37f43e422
2 changed files with 33 additions and 41 deletions
--- a/src/amd/compiler/aco_dead_code_analysis.cpp
+++ b/src/amd/compiler/aco_dead_code_analysis.cpp
@@ -30,51 +30,40 @@
 /*
 * Implements an analysis pass to determine the number of uses
 * for each SSA-definition.
+ *
+ * This pass assumes that no loop header phis are dead code.
 */

 namespace aco {
 namespace {

-struct dce_ctx {
-   int current_block;
-   std::vector<uint16_t> uses;
-   std::vector<std::vector<bool>> live;
-
-   dce_ctx(Program* program)
-       : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
-   {
-      live.reserve(program->blocks.size());
-      for (Block& block : program->blocks)
-         live.emplace_back(block.instructions.size());
-   }
-};
-
 void
-process_block(dce_ctx& ctx, Block& block)
+process_loop_header_phis(std::vector<uint16_t>& uses, Block& block)
 {
-   std::vector<bool>& live = ctx.live[block.index];
-   assert(live.size() == block.instructions.size());
-   bool process_predecessors = false;
-   for (int idx = block.instructions.size() - 1; idx >= 0; idx--) {
-      if (live[idx])
-         continue;
-
-      aco_ptr<Instruction>& instr = block.instructions[idx];
-      if (!is_dead(ctx.uses, instr.get())) {
-         for (const Operand& op : instr->operands) {
-            if (op.isTemp()) {
-               if (ctx.uses[op.tempId()] == 0)
-                  process_predecessors = true;
-               ctx.uses[op.tempId()]++;
-            }
-         }
-         live[idx] = true;
+   for (aco_ptr<Instruction>& instr : block.instructions) {
+      if (!is_phi(instr))
+         return;
+      for (const Operand& op : instr->operands) {
+         if (op.isTemp())
+            uses[op.tempId()]++;
      }
   }
+}

-   if (process_predecessors) {
-      for (unsigned pred_idx : block.linear_preds)
-         ctx.current_block = std::max(ctx.current_block, (int)pred_idx);
+void
+process_block(std::vector<uint16_t>& uses, Block& block)
+{
+   for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); it++) {
+      aco_ptr<Instruction>& instr = *it;
+      if ((block.kind & block_kind_loop_header) && is_phi(instr))
+         break;
+
+      if (!is_dead(uses, instr.get())) {
+         for (const Operand& op : instr->operands) {
+            if (op.isTemp())
+               uses[op.tempId()]++;
+         }
+      }
   }
 }

@@ -83,15 +72,17 @@ process_block(dce_ctx& ctx, Block& block)
 std::vector<uint16_t>
 dead_code_analysis(Program* program)
 {
+   std::vector<uint16_t> uses(program->peekAllocationId());

-   dce_ctx ctx(program);
-
-   while (ctx.current_block >= 0) {
-      unsigned next_block = ctx.current_block--;
-      process_block(ctx, program->blocks[next_block]);
+   for (Block& block : program->blocks) {
+      if (block.kind & block_kind_loop_header)
+         process_loop_header_phis(uses, block);
   }

-   return ctx.uses;
+   for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); it++)
+      process_block(uses, *it);
+
+   return uses;
 }

 } // namespace aco
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -251,6 +251,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>

      /* create ssa name for restore mask */
      if (info.has_divergent_break) {
+         // TODO: this phi is unnecessary if we end WQM immediately after the loop
         /* this phi might be trivial but ensures a parallelcopy on the loop header */
         aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
            aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};