From 8a7d34e3bdf35dee35bd1d1fd43738d8ebe05218 Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Thu, 21 Jul 2022 17:52:48 +0200 Subject: [PATCH] r600/sfn: Fix the kcache failure handling Instead of starting a new block when the kcache handling failed, try to continue scheduling instructions until kcache allocation fails for all ready instructions. With that we avoid a CF split within an LDS fetch/read group. Signed-off-by: Gert Wollny Part-of: --- src/gallium/drivers/r600/sfn/sfn_instr.cpp | 75 ++++++--- src/gallium/drivers/r600/sfn/sfn_instr.h | 10 +- .../drivers/r600/sfn/sfn_scheduler.cpp | 144 ++++++++++-------- 3 files changed, 138 insertions(+), 91 deletions(-) diff --git a/src/gallium/drivers/r600/sfn/sfn_instr.cpp b/src/gallium/drivers/r600/sfn/sfn_instr.cpp index d81e329531e..6ab2518d3fa 100644 --- a/src/gallium/drivers/r600/sfn/sfn_instr.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_instr.cpp @@ -302,17 +302,43 @@ void Block::push_back(PInst instr) bool Block::try_reserve_kcache(const AluGroup& group) { + auto kcache = m_kcache; + auto kcache_constants = group.get_kconsts(); for (auto& kc : kcache_constants) { auto u = kc->as_uniform(); assert(u); - if (!try_reserve_kcache(*u)) + if (!try_reserve_kcache(*u, kcache)) { + m_kcache_alloc_failed = true; return false; + } } + + m_kcache = kcache; + m_kcache_alloc_failed = false; return true; } -bool Block::try_reserve_kcache(const UniformValue& u) +bool Block::try_reserve_kcache(const AluInstr& instr) +{ + auto kcache = m_kcache; + + for (auto& src : instr.sources()) { + auto u = src->as_uniform(); + if (u) { + if (!try_reserve_kcache(*u, kcache)) { + m_kcache_alloc_failed = true; + return false; + } + } + } + m_kcache = kcache; + m_kcache_alloc_failed = false; + return true; +} + +bool Block::try_reserve_kcache(const UniformValue& u, + std::array& kcache) const { const int kcache_banks = 4; // TODO: handle pre-evergreen @@ -323,49 +349,50 @@ bool Block::try_reserve_kcache(const UniformValue& u) bool found = false; 
for (int i = 0; i < kcache_banks && !found; ++i) { - if (m_kcache[i].mode) { - if (m_kcache[i].bank < bank) + if (kcache[i].mode) { + if (kcache[i].bank < bank) continue; - if ((m_kcache[i].bank == bank && - m_kcache[i].addr > line + 1) || - m_kcache[i].bank > bank) { - if (m_kcache[kcache_banks - 1].mode) + if ((kcache[i].bank == bank && + kcache[i].addr > line + 1) || + kcache[i].bank > bank) { + if (kcache[kcache_banks - 1].mode) return false; - memmove(&m_kcache[i+1],&m_kcache[i], (kcache_banks-i-1)*sizeof(KCacheLine)); - m_kcache[i].mode = KCacheLine::lock_1; - m_kcache[i].bank = bank; - m_kcache[i].addr = line; + memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(KCacheLine)); + kcache[i].mode = KCacheLine::lock_1; + kcache[i].bank = bank; + kcache[i].addr = line; return true; } - int d = line - m_kcache[i].addr; + int d = line - kcache[i].addr; if (d == -1) { - m_kcache[i].addr--; - if (m_kcache[i].mode == KCacheLine::lock_2) { + kcache[i].addr--; + if (kcache[i].mode == KCacheLine::lock_2) { /* we are prepending the line to the current set, - * discarding the existing second line, - * so we'll have to insert line+2 after it */ + * discarding the existing second line, + * so we'll have to insert line+2 after it */ line += 2; continue; - } else if (m_kcache[i].mode == KCacheLine::lock_1) { - m_kcache[i].mode = KCacheLine::lock_2; + } else if (kcache[i].mode == KCacheLine::lock_1) { + kcache[i].mode = KCacheLine::lock_2; return true; } else { /* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */ return false; } } else if (d == 1) { - m_kcache[i].mode = KCacheLine::lock_2; + kcache[i].mode = KCacheLine::lock_2; return true; - } else if (d == 0) + } else if (d == 0) { return true; + } } else { /* free kcache set - use it */ - m_kcache[i].mode = KCacheLine::lock_1; - m_kcache[i].bank = bank; - m_kcache[i].addr = line; + kcache[i].mode = KCacheLine::lock_1; + kcache[i].bank = bank; + kcache[i].addr = line; return true; } } diff --git 
a/src/gallium/drivers/r600/sfn/sfn_instr.h b/src/gallium/drivers/r600/sfn/sfn_instr.h index c70427e8575..19f118149bd 100644 --- a/src/gallium/drivers/r600/sfn/sfn_instr.h +++ b/src/gallium/drivers/r600/sfn/sfn_instr.h @@ -196,7 +196,8 @@ public: void set_type(Type t); uint32_t remaining_slots() const { return m_remaining_slots;} - bool try_reserve_kcache(const AluGroup& group); + bool try_reserve_kcache(const AluGroup& group); + bool try_reserve_kcache(const AluInstr& instr); auto last_lds_instr() {return m_last_lds_instr;} void set_last_lds_instr(Instr *instr) {m_last_lds_instr = instr;} @@ -207,8 +208,11 @@ public: size_t size() const { return m_instructions.size();} + bool kcache_reservation_failed() const { return m_kcache_alloc_failed;} + private: - bool try_reserve_kcache(const UniformValue& u); + bool try_reserve_kcache(const UniformValue& u, + std::array& kcache) const; bool do_ready() const override {return true;}; void do_print(std::ostream& os) const override; @@ -221,11 +225,13 @@ private: uint32_t m_remaining_slots{0xffff}; std::array m_kcache; + bool m_kcache_alloc_failed{false}; Instr *m_last_lds_instr{nullptr}; int m_lds_group_requirement{0}; AluInstr *m_lds_group_start{nullptr}; + }; class InstrWithVectorResult : public Instr { diff --git a/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp b/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp index faf116a5cdb..87ab579f8a3 100644 --- a/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp @@ -489,83 +489,84 @@ bool BlockSheduler::schedule_alu(Shader::ShaderBlocks& out_blocks) bool has_lds_ready = !alu_vec_ready.empty() && (*alu_vec_ready.begin())->has_lds_access(); + /* If we have ready ALU instructions we have to start a new ALU block */ + if (has_alu_ready || !alu_groups_ready.empty()) { + if (m_current_block->type() != Block::alu) { + start_new_block(out_blocks, Block::alu); + m_alu_groups_schduled = 0; + } + } + /* Schedule groups first. 
unless we have a pending LDS instuction * We don't want the LDS instructions to be too far apart because the * fetch + read from queue has to be in the same ALU CF block */ if (!alu_groups_ready.empty() && !has_lds_ready) { group = *alu_groups_ready.begin(); - alu_groups_ready.erase(alu_groups_ready.begin()); - sfn_log << SfnLog::schedule << "Schedule ALU group\n"; - success = true; - } else { - if (has_alu_ready) { - group = new AluGroup(); - sfn_log << SfnLog::schedule << "START new ALU group\n"; - } - } - - if (group) { - int free_slots = group->free_slots(); - - if (free_slots && has_alu_ready) { - if (!alu_vec_ready.empty()) - success |= schedule_alu_to_group_vec(group); - - /* Apparently one can't schedule a t-slot if there is already - * and LDS instruction scheduled. - * TODO: check whether this is only relevant for actual LDS instructions - * or also for instructions that read from the LDS return value queue */ - - if (free_slots & 0x10 && !has_lds_ready) { - sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n"; - if (!alu_trans_ready.empty()) - success |= schedule_alu_to_group_trans(group, alu_trans_ready); - if (!alu_vec_ready.empty()) - success |= schedule_alu_to_group_trans(group, alu_vec_ready); - } - } - - sfn_log << SfnLog::schedule << "Finalize ALU group\n"; - group->set_scheduled(); - group->fix_last_flag(); - group->set_nesting_depth(m_current_block->nesting_depth()); - - - if (m_current_block->type() != Block::alu) { - start_new_block(out_blocks, Block::alu); - m_alu_groups_schduled = 0; - } - - /* Pessimistic hack: If we have started an LDS group, - * make sure 8 instructions groups still fit into the CF - * TODO: take care of Address slot emission - * TODO: maybe do this CF split only in the assembler - */ - /*if (group->slots() > m_current_block->remaining_slots() || - (group->has_lds_group_start() && - m_current_block->remaining_slots() < 7 * 8)) { - //assert(!m_current_block->lds_group_active()); - start_new_block(out_blocks, 
Block::alu); - }*/ - if (!m_current_block->try_reserve_kcache(*group)) { - assert(!m_current_block->lds_group_active()); start_new_block(out_blocks, Block::alu); m_current_block->set_instr_flag(Instr::force_cf); } - assert(m_current_block->try_reserve_kcache(*group)); - - if (group->has_lds_group_start()) - m_current_block->lds_group_start(*group->begin()); - - m_current_block->push_back(group); - if (group->has_lds_group_end()) - m_current_block->lds_group_end(); + if (!m_current_block->try_reserve_kcache(*group)) + unreachable("Scheduling a group in a new block should always succeed"); + alu_groups_ready.erase(alu_groups_ready.begin()); + sfn_log << SfnLog::schedule << "Schedule ALU group\n"; + success = true; + } else if (has_alu_ready) { + group = new AluGroup(); + sfn_log << SfnLog::schedule << "START new ALU group\n"; + } else { + return false; } - if (success) - ++m_alu_groups_schduled; + assert(group); + + int free_slots = group->free_slots(); + + while (free_slots && has_alu_ready) { + if (!alu_vec_ready.empty()) + success |= schedule_alu_to_group_vec(group); + + /* Apparently one can't schedule a t-slot if there is already + * and LDS instruction scheduled. 
+ * TODO: check whether this is only relevant for actual LDS instructions + * or also for instructions that read from the LDS return value queue */ + + if (free_slots & 0x10 && !has_lds_ready) { + sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n"; + if (!alu_trans_ready.empty()) + success |= schedule_alu_to_group_trans(group, alu_trans_ready); + if (!alu_vec_ready.empty()) + success |= schedule_alu_to_group_trans(group, alu_vec_ready); + } + + if (success) { + ++m_alu_groups_schduled; + break; + } else if (m_current_block->kcache_reservation_failed()) { + // LDS read groups should not lead to impossible + // kcache constellations + assert(!m_current_block->lds_group_active()); + + // kcache reservation failed, so we have to start a new CF + start_new_block(out_blocks, Block::alu); + m_current_block->set_instr_flag(Instr::force_cf); + } else { + return false; + } + } + + sfn_log << SfnLog::schedule << "Finalize ALU group\n"; + group->set_scheduled(); + group->fix_last_flag(); + group->set_nesting_depth(m_current_block->nesting_depth()); + m_current_block->push_back(group); + + if (group->has_lds_group_start()) + m_current_block->lds_group_start(*group->begin()); + + if (group->has_lds_group_end()) + m_current_block->lds_group_end(); return success; } @@ -652,6 +653,13 @@ bool BlockSheduler::schedule_alu_to_group_vec(AluGroup *group) auto e = alu_vec_ready.end(); while (i != e) { sfn_log << SfnLog::schedule << "Try schedule to vec " << **i; + + if (!m_current_block->try_reserve_kcache(**i)) { + sfn_log << SfnLog::schedule << " failed (kcache)\n"; + ++i; + continue; + } + if (group->add_vec_instructions(*i)) { auto old_i = i; ++i; @@ -679,6 +687,12 @@ bool BlockSheduler::schedule_alu_to_group_trans(AluGroup *group, std::listtry_reserve_kcache(**i)) { + sfn_log << SfnLog::schedule << " failed (kcache)\n"; + ++i; + continue; + } + if (group->add_trans_instructions(*i)) { auto old_i = i; ++i;