From f1f01aaef53cd6a9593dade1e2fedcacbc2e45c2 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Wed, 7 Jun 2023 17:08:10 +0100
Subject: [PATCH] aco/gfx11: schedule for VMEM store clauses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fossil-db (gfx1100):
Totals from 49486 (37.09% of 133428) affected shaders:
Instrs: 18376819 -> 18480712 (+0.57%); split: -0.00%, +0.57%
CodeSize: 91810836 -> 92227292 (+0.45%); split: -0.00%, +0.45%
VGPRs: 2031824 -> 2047784 (+0.79%); split: -0.02%, +0.81%
Latency: 104259318 -> 103804792 (-0.44%); split: -0.44%, +0.00%
InvThroughput: 16388760 -> 16399819 (+0.07%); split: -0.13%, +0.19%
VClause: 568844 -> 432401 (-23.99%)
Copies: 1197942 -> 1231202 (+2.78%); split: -0.08%, +2.86%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23505>
---
 src/amd/compiler/aco_scheduler.cpp | 49 +++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 02498c01195..2e43fa2d62a 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -1019,6 +1019,37 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
    }
 }
 
+unsigned /* Groups earlier clause candidates with the store `current` (at index idx); returns how many were moved. */
+schedule_VMEM_store(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
+                    Instruction* current, int idx) /* NOTE(review): register_demand is never read in this body. */
+{
+   hazard_query hq;
+   init_hazard_query(ctx, &hq); /* hq collects every candidate that gets skipped below */
+
+   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true); /* cursor.source_idx names the next candidate */
+   unsigned skip = 0; /* number of successful downwards_move() calls so far */
+
+   for (int i = 0; i < VMEM_CLAUSE_MAX_GRAB_DIST; i++) { /* inspect at most VMEM_CLAUSE_MAX_GRAB_DIST candidates */
+      aco_ptr<Instruction>& candidate = block->instructions[cursor.source_idx];
+      if (candidate->opcode == aco_opcode::p_logical_start) /* stop once p_logical_start is reached */
+         break;
+
+      if (!should_form_clause(current, candidate.get())) { /* not a clause partner for current */
+         add_to_hazard_query(&hq, candidate.get()); /* remember it so later moves are checked against it */
+         ctx.mv.downwards_skip(cursor); /* step over it; this still counts towards the iteration limit */
+         continue;
+      }
+
+      if (perform_hazard_query(&hq, candidate.get(), false) != hazard_success || /* hazard vs. skipped ones */
+          ctx.mv.downwards_move(cursor, true) != move_success) /* or the move itself was rejected */
+         break; /* give up at the first partner that cannot be moved */
+
+      skip++; /* one more instruction grouped with current */
+   }
+
+   return skip; /* schedule_block subtracts this from its loop index */
+}
+
 void
 schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
 {
@@ -1028,6 +1059,7 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
    ctx.mv.register_demand = live_vars.register_demand[block->index].data();
 
    /* go through all instructions and find memory loads */
+   unsigned num_stores = 0;
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
       Instruction* current = block->instructions[idx].get();
 
@@ -1040,8 +1072,10 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
          }
       }
 
-      if (current->definitions.empty())
+      if (current->definitions.empty()) {
+         num_stores += current->isVMEM() || current->isFlatLike() ? 1 : 0;
          continue;
+      }
 
       if (current->isVMEM() || current->isFlatLike()) {
          ctx.mv.current = current;
@@ -1054,6 +1088,19 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
       }
    }
 
+   /* GFX11 benefits from creating VMEM store clauses. */
+   if (num_stores > 1 && program->gfx_level >= GFX11) {
+      for (int idx = block->instructions.size() - 1; idx >= 0; idx--) {
+         Instruction* current = block->instructions[idx].get();
+         if (!current->definitions.empty() || !(current->isVMEM() || current->isFlatLike()))
+            continue;
+
+         ctx.mv.current = current;
+         idx -=
+            schedule_VMEM_store(ctx, block, live_vars.register_demand[block->index], current, idx);
+      }
+   }
+
    /* resummarize the block's register demand */
    block->register_demand = RegisterDemand();
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {