From f01cac835f95bcc65b2ca0afeed4f29a6bbb64f0 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Fri, 3 May 2024 12:04:59 +0100
Subject: [PATCH] aco/stats: support GFX12 in collect_preasm_stats()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29225>
---
 src/amd/compiler/aco_insert_waitcnt.cpp | 21 ---------------------
 src/amd/compiler/aco_ir.cpp             | 15 +++++++++++++++
 src/amd/compiler/aco_ir.h               | 11 +++++++++++
 src/amd/compiler/aco_statistics.cpp     | 19 +++++++++++++------
 4 files changed, 39 insertions(+), 27 deletions(-)
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 277228f6b83..9369240c17f 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -74,12 +74,6 @@ enum counter_type : uint8_t {
    wait_counters = BITFIELD_MASK(wait_type_num),
 };
 
-enum vmem_type : uint8_t {
-   vmem_nosampler = 1 << 0,
-   vmem_sampler = 1 << 1,
-   vmem_bvh = 1 << 2,
-};
-
 /* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
  * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
  * that we should switch to a different wave and contains info on dependencies as to
@@ -349,21 +343,6 @@ struct wait_ctx {
    }
 };
 
-uint8_t
-get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
-{
-   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
-      return vmem_bvh;
-   else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
-      return vmem_sampler;
-   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
-            instr->operands[1].regClass() == s4)
-      return vmem_sampler;
-   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
-      return vmem_nosampler;
-   return 0;
-}
-
 wait_event
 get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type)
 {
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index efe4611c9d5..a987bfa33ad 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -1414,6 +1414,21 @@ get_op_fixed_to_def(Instruction* instr)
    return -1;
 }
 
+uint8_t
+get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
+{
+   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
+      return vmem_bvh;
+   else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
+      return vmem_sampler;
+   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
+            instr->operands[1].regClass() == s4)
+      return vmem_sampler;
+   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
+      return vmem_nosampler;
+   return 0;
+}
+
 bool
 dealloc_vgprs(Program* program)
 {
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 97049361859..527ee8abb8d 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1781,6 +1781,17 @@ unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index);
 
 bool should_form_clause(const Instruction* a, const Instruction* b);
 
+enum vmem_type : uint8_t {
+   vmem_nosampler = 1 << 0,
+   vmem_sampler = 1 << 1,
+   vmem_bvh = 1 << 2,
+};
+
+/* Returns the vmem_type of a VMEM, scratch or global instruction, or 0 for any other instruction.
+ * Same-type VMEM instructions return in-order; on GFX12+ the type also selects the wait counter.
+ */
+uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr);
+
 enum block_kind {
    /* uniform indicates that leaving this block,
     * all actives lanes stay active */
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
index 9260ed94b18..31b5829ee2b 100644
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -288,10 +288,11 @@ get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
       else
          info[wait_type_vs] = 320;
    } else if (instr->isSMEM()) {
+      wait_type type = gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
       if (instr->definitions.empty()) {
-         info[wait_type_lgkm] = 200;
+         info[type] = 200;
       } else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */
-         info[wait_type_lgkm] = 1;
+         info[type] = 1;
       } else {
          bool likely_desc_load = instr->operands[0].size() == 2;
          bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
@@ -299,15 +300,21 @@ get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
             instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());
 
          if (likely_desc_load || const_offset)
-            info[wait_type_lgkm] = 30; /* likely to hit L0 cache */
+            info[type] = 30; /* likely to hit L0 cache */
          else
-            info[wait_type_lgkm] = 200;
+            info[type] = 200;
       }
    } else if (instr->isDS()) {
       info[wait_type_lgkm] = 20;
+   } else if (instr->isVMEM() && instr->definitions.empty() && gfx_level >= GFX10) {
+      info[wait_type_vs] = 320;
    } else if (instr->isVMEM()) {
-      wait_type type =
-         instr->definitions.empty() && gfx_level >= GFX10 ? wait_type_vs : wait_type_vm;
+      uint8_t vm_type = get_vmem_type(gfx_level, instr.get());
+      wait_type type = wait_type_vm;
+      if (gfx_level >= GFX12 && vm_type == vmem_bvh)
+         type = wait_type_bvh;
+      else if (gfx_level >= GFX12 && vm_type == vmem_sampler)
+         type = wait_type_sample;
       info[type] = 320;
    }