From 825cd696dc3204b682424717a8e39e34311589e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= <daniel@schuermann.dev>
Date: Mon, 24 Jan 2022 19:43:49 +0100
Subject: [PATCH] aco/insert_exec_mask: stay in WQM while helper lanes are
 still needed

This patch flags as WQM all instructions which don't require
Exact mode but depend on the exec mask, as long as WQM
is needed on any control flow path afterwards.
This mostly prevents accidental copies of WQM values
within Exact mode, and also makes a lot of other workarounds
unnecessary.

Totals from 17374 (12.88% of 134913) affected shaders: (GFX10.3)
VGPRs: 526952 -> 527384 (+0.08%); split: -0.01%, +0.09%
CodeSize: 33740512 -> 33766636 (+0.08%); split: -0.06%, +0.14%
MaxWaves: 488166 -> 488108 (-0.01%); split: +0.00%, -0.02%
Instrs: 6254240 -> 6260557 (+0.10%); split: -0.08%, +0.18%
Latency: 66497580 -> 66463472 (-0.05%); split: -0.15%, +0.10%
InvThroughput: 13265741 -> 13264036 (-0.01%); split: -0.03%, +0.01%
VClause: 122962 -> 122975 (+0.01%); split: -0.01%, +0.02%
SClause: 334805 -> 334405 (-0.12%); split: -0.51%, +0.39%
Copies: 275728 -> 282341 (+2.40%); split: -0.91%, +3.31%
Branches: 92546 -> 90990 (-1.68%); split: -1.68%, +0.00%
PreSGPRs: 504119 -> 504352 (+0.05%); split: -0.00%, +0.05%

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14951>
---
 src/amd/compiler/aco_insert_exec_mask.cpp | 41 +++++++----------------
 src/amd/compiler/aco_ir.cpp               |  1 +
 2 files changed, 14 insertions(+), 28 deletions(-)
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 1b0c0b61fd9..555d6e88bb5 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -85,7 +85,6 @@ struct block_info {
    std::vector<WQMState> instr_needs;
    uint8_t block_needs;
    uint8_t ever_again_needs;
-   bool logical_end_wqm;
    /* more... */
 };
 
@@ -129,18 +128,12 @@ mark_block_wqm(wqm_ctx& ctx, unsigned block_idx)
    if (ctx.branch_wqm[block_idx])
       return;
 
-   ctx.branch_wqm[block_idx] = true;
-   ctx.worklist.insert(block_idx);
-
-   Block& block = ctx.program->blocks[block_idx];
-
-   /* TODO: this sets more branch conditions to WQM than it needs to
-    * it should be enough to stop at the "exec mask top level" */
-   if (block.kind & block_kind_top_level)
-      return;
-
-   for (unsigned pred_idx : block.logical_preds)
-      mark_block_wqm(ctx, pred_idx);
+   for (Block& block : ctx.program->blocks) {
+      if (block.index >= block_idx && block.kind & block_kind_top_level)
+         break;
+      ctx.branch_wqm[block.index] = true;
+      ctx.worklist.insert(block.index);
+   }
 }
 
 void
@@ -185,18 +178,11 @@ get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
       } else if (preserve_wqm & ctx.ever_again_needs_wqm) {
          /* Preserve WQM if WQM is needed later */
          needs = Preserve_WQM;
+      } else if (needs == Unspecified && info.block_needs & WQM) {
+         needs = pred_by_exec ? WQM : Unspecified;
       }
 
-      /* ensure the condition controlling the control flow for this phi is in WQM */
-      if (needs == WQM && instr->opcode == aco_opcode::p_phi) {
-         for (unsigned pred_idx : block->logical_preds) {
-            mark_block_wqm(ctx, pred_idx);
-            exec_ctx.info[pred_idx].logical_end_wqm = true;
-            ctx.worklist.insert(pred_idx);
-         }
-      }
-
-      if ((instr->opcode == aco_opcode::p_logical_end && info.logical_end_wqm) ||
+      if ((instr->opcode == aco_opcode::p_logical_end && ctx.branch_wqm[block->index]) ||
           instr->opcode == aco_opcode::p_wqm) {
          assert(needs != Exact);
          needs = WQM;
@@ -210,9 +196,8 @@ get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
 
    /* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
     * <cond> should be computed in WQM */
-   if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) {
-      for (unsigned pred_idx : block->logical_preds)
-         mark_block_wqm(ctx, pred_idx);
+   if (info.block_needs & WQM) {
+      mark_block_wqm(ctx, block->index);
    }
 }
 
@@ -421,8 +406,8 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
 
       if (ctx.handle_wqm) {
          ctx.info[0].exec.emplace_back(start_exec, mask_type_global | mask_type_exact);
-         /* if this block only needs WQM, initialize already */
-         if (ctx.info[0].block_needs == WQM)
+         /* if this block needs WQM, initialize already */
+         if (ctx.info[0].block_needs & WQM)
             transition_to_WQM(ctx, bld, 0);
       } else {
          uint8_t mask = mask_type_global;
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 4f9091f9e24..cd705bc1a18 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -571,6 +571,7 @@ needs_exec_mask(const Instruction* instr)
          return instr->reads_exec();
       case aco_opcode::p_spill:
       case aco_opcode::p_reload:
+      case aco_opcode::p_end_linear_vgpr:
       case aco_opcode::p_logical_start:
       case aco_opcode::p_logical_end:
       case aco_opcode::p_startpgm: return instr->reads_exec();