From df1de388a3232be7e70c3cd47651b0da4e0367d2 Mon Sep 17 00:00:00 2001
From: Georg Lehmann <dadschoorse@gmail.com>
Date: Fri, 24 Jan 2025 08:42:00 +0100
Subject: [PATCH] aco/sched_ilp: reorder VINTRP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VINTRP(gfx6-gfx10.3) is mostly just VALU, but we treated it like memory
instructions as an afterthought. This had issues as VINTRP was never reordered
with itself, or other memory instructions. Reordering VINTRP in clauses
increases ILP. We don't really need collect_clause_dependencies for VINTRP
either, because they ususally have the same dependencies already. That means
we can still form VINTRP clauses by selecting preferably VINTRP after a
previous one.

Foz-DB Navi21:
Totals from 34184 (43.16% of 79206) affected shaders:
Instrs: 18811270 -> 18812046 (+0.00%); split: -0.01%, +0.02%
CodeSize: 103627276 -> 103630056 (+0.00%); split: -0.01%, +0.01%
Latency: 188379364 -> 187936731 (-0.23%); split: -0.27%, +0.03%
InvThroughput: 42600163 -> 42590608 (-0.02%); split: -0.03%, +0.00%
VClause: 378960 -> 378912 (-0.01%); split: -0.02%, +0.00%
SClause: 727560 -> 720573 (-0.96%); split: -1.08%, +0.12%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Daniel Schürmann <None>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33111>
---
 src/amd/compiler/aco_scheduler_ilp.cpp       | 19 +++++++++++++------
 src/amd/compiler/tests/test_d3d11_derivs.cpp |  2 +-
 2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp
index 1adfd43a47a..4589427d9a1 100644
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -80,7 +80,7 @@ struct SchedILPContext {
 bool
 can_reorder(const Instruction* const instr)
 {
-   if (instr->isVALU())
+   if (instr->isVALU() || instr->isVINTRP())
       return true;
    if (!instr->isSALU() || instr->isSOPP())
       return false;
@@ -285,7 +285,7 @@ unsigned
 get_latency(const Instruction* const instr)
 {
    /* Note, that these are not accurate latency estimations. */
-   if (instr->isVALU())
+   if (instr->isVALU() || instr->isVINTRP())
       return 5;
    if (instr->isSALU())
       return 2;
@@ -432,9 +432,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
        * any cases that are actually a concern for clause formation are added as transitive
        * dependencies. */
       write_dep_mask &= ~ctx.non_reorder_mask;
-      /* Ignore RaW for VINTERP. */
-      if (instr->isVINTRP())
-         entry.dependency_mask &= ~ctx.non_reorder_mask;
       ctx.potential_partial_clause = true;
    } else if (ctx.last_non_reorderable != UINT8_MAX) {
       ctx.potential_partial_clause = false;
@@ -558,8 +555,14 @@ select_instruction_ilp(const SchedILPContext& ctx)
       mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
    }
 
+   /* VINTRP(gfx6-10.3) can be handled like alu, but switching between VINTRP and other
+    * alu has a cost. So if the previous instr was VINTRP, try to keep selecting VINTRP.
+    */
+   bool prefer_vintrp = ctx.prev_info.instr && ctx.prev_info.instr->isVINTRP();
+
    /* Select the instruction with highest priority of all candidates. */
    unsigned idx = -1u;
+   bool idx_vintrp = false;
    int32_t priority = INT32_MIN;
    u_foreach_bit (i, mask) {
       const InstrInfo& candidate = ctx.nodes[i];
@@ -568,8 +571,12 @@ select_instruction_ilp(const SchedILPContext& ctx)
       if (candidate.dependency_mask)
          continue;
 
-      if (idx == -1u || candidate.priority > priority) {
+      bool is_vintrp = prefer_vintrp && candidate.instr->isVINTRP();
+
+      if (idx == -1u || (is_vintrp && !idx_vintrp) ||
+          (is_vintrp == idx_vintrp && candidate.priority > priority)) {
          idx = i;
+         idx_vintrp = is_vintrp;
          priority = candidate.priority;
       }
    }
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index 3906d7295e3..69a879da883 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -378,8 +378,8 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y                   ; $_
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                   ; $_
-   //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp                           ; $_
    //>> v_mov_b32_e32 v#ry, 0.5                                      ; $_
+   //>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp                           ; $_
    //>> v_mov_b32_e32 v#rx, v#rx_tmp                                 ; $_
    //>> v_mov_b32_e32 v#rl, v#rl_tmp                                 ; $_
    //>> BB1: