aco: add new post-RA scheduler for ILP

Totals from 77247 (97.37% of 79330) affected shaders: (GFX11)

Instrs: 44371374 -> 43215723 (-2.60%); split: -2.64%, +0.03%
CodeSize: 227819532 -> 223188224 (-2.03%); split: -2.06%, +0.03%
Latency: 301016823 -> 290147626 (-3.61%); split: -3.70%, +0.09%
InvThroughput: 48551749 -> 47646212 (-1.87%); split: -1.88%, +0.01%
VClause: 870581 -> 834655 (-4.13%); split: -4.13%, +0.00%
SClause: 1487061 -> 1340851 (-9.83%); split: -9.83%, +0.00%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25676>
Daniel Schürmann, 2023-10-12 10:52:45 +02:00 (committed by Marge Bot)
parent 72a5c659d4
commit 023e78b4d7
7 changed files with 444 additions and 15 deletions


@@ -97,6 +97,10 @@ The next step is a pass out of SSA by inserting parallelcopies at the end of blo
Most pseudo instructions are lowered to actual machine instructions.
These are mostly parallel copy instructions created by instruction selection or register allocation and spill/reload code.
#### ILP Scheduling
This second scheduler works on physical registers rather than SSA values to determine dependencies. It implements a forward list-scheduling algorithm using a partial dependency graph over a few instructions at a time, and aims to create larger memory clauses and to improve ILP.
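The following is a rough sketch of that windowed list-scheduling idea, not the actual implementation: `Instr`, `depends_on`, and `schedule_block` are illustrative placeholders, while the real pass derives dependencies from physical registers, keeps memory loads and other non-reorderable instructions in order, and biases priorities to hide latency.

```cpp
#include <array>
#include <cstdint>
#include <functional>
#include <vector>

struct Instr {
   int id;           /* stand-in for the instruction itself */
   int32_t priority; /* e.g. raised for long-latency producers */
};

std::vector<int>
schedule_block(const std::vector<Instr>& input,
               const std::function<bool(const Instr&, const Instr&)>& depends_on)
{
   constexpr unsigned window_size = 16; /* matches the pass's 16-node window */
   struct Node {
      Instr instr{};
      bool valid = false;
      uint16_t deps = 0; /* window slots that must be emitted before this one */
   };
   std::array<Node, window_size> window{};
   std::vector<int> order;
   size_t next = 0;

   /* Insert the next unscheduled instruction into a slot and record which
    * nodes still in the window it depends on. */
   auto insert = [&](unsigned slot)
   {
      if (next >= input.size())
         return;
      Node n{input[next++], true, 0};
      for (unsigned j = 0; j < window_size; j++) {
         if (window[j].valid && depends_on(n.instr, window[j].instr))
            n.deps |= uint16_t(1) << j;
      }
      window[slot] = n;
   };

   for (unsigned i = 0; i < window_size; i++)
      insert(i);

   while (true) {
      /* Pick the ready node (no pending dependencies) with the highest priority. */
      int best = -1;
      for (unsigned i = 0; i < window_size; i++) {
         if (!window[i].valid || window[i].deps)
            continue;
         if (best < 0 || window[i].instr.priority > window[best].instr.priority)
            best = (int)i;
      }
      if (best < 0)
         break; /* window drained */

      order.push_back(window[best].instr.id);
      window[best].valid = false;
      for (Node& n : window)
         n.deps &= ~(uint16_t(1) << best); /* this dependency is now satisfied */
      insert(best); /* refill the freed slot */
   }
   return order;
}
```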
#### Insert wait states
GCN requires some wait states to be manually inserted in order to ensure correct behavior on memory instructions and some register dependencies.
@@ -249,7 +253,8 @@ We also have `ACO_DEBUG` options:
* `force-waitcnt` - Forces ACO to emit a wait state after each instruction when there is something to wait for. Harms performance.
* `novn` - Disables the ACO value numbering stage.
* `noopt` - Disables the ACO optimizer.
* `nosched` - Disables the ACO scheduler.
* `nosched` - Disables both the ACO pre-RA and post-RA schedulers.
* `nosched-ilp` - Disables the ACO post-RA ILP scheduler.
Note that you need to **combine these options into a comma-separated list**, for example `RADV_DEBUG=nocache,shaders`; otherwise only the last one will take effect. (This is how all environment variables work, yet it is a common mistake.) Example:


@@ -199,6 +199,10 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
aco::lower_to_hw_instr(program.get());
validate(program.get());
/* Schedule hardware instructions for ILP */
if (!options->optimisations_disabled && !(aco::debug_flags & aco::DEBUG_NO_SCHED_ILP))
aco::schedule_ilp(program.get());
/* Insert Waitcnt */
aco::insert_wait_states(program.get());
aco::insert_NOPs(program.get());
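With these hooks in place, the new pass can be disabled on its own for comparison runs via `ACO_DEBUG=nosched-ilp`, which sets `DEBUG_NO_SCHED_ILP` and makes the check above skip `schedule_ilp`, while `ACO_DEBUG=nosched` now disables both the pre-RA and the post-RA scheduler.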


@@ -36,18 +36,20 @@ thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;
uint64_t debug_flags = 0;
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
{"validatera", DEBUG_VALIDATE_RA},
{"novalidateir", DEBUG_NO_VALIDATE_IR},
{"perfwarn", DEBUG_PERFWARN},
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
{"force-waitdeps", DEBUG_FORCE_WAITDEPS},
{"novn", DEBUG_NO_VN},
{"noopt", DEBUG_NO_OPT},
{"nosched", DEBUG_NO_SCHED},
{"perfinfo", DEBUG_PERF_INFO},
{"liveinfo", DEBUG_LIVE_INFO},
{NULL, 0}};
static const struct debug_control aco_debug_options[] = {
{"validateir", DEBUG_VALIDATE_IR},
{"validatera", DEBUG_VALIDATE_RA},
{"novalidateir", DEBUG_NO_VALIDATE_IR},
{"perfwarn", DEBUG_PERFWARN},
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
{"force-waitdeps", DEBUG_FORCE_WAITDEPS},
{"novn", DEBUG_NO_VN},
{"noopt", DEBUG_NO_OPT},
{"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP},
{"nosched-ilp", DEBUG_NO_SCHED_ILP},
{"perfinfo", DEBUG_PERF_INFO},
{"liveinfo", DEBUG_LIVE_INFO},
{NULL, 0}};
static once_flag init_once_flag = ONCE_FLAG_INIT;


@@ -57,6 +57,7 @@ enum {
DEBUG_LIVE_INFO = 0x100,
DEBUG_FORCE_WAITDEPS = 0x200,
DEBUG_NO_VALIDATE_IR = 0x400,
DEBUG_NO_SCHED_ILP = 0x800,
};
enum storage_class : uint8_t {
@@ -2207,6 +2208,7 @@ void register_allocation(Program* program, std::vector<IDSet>& live_out_per_bloc
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void schedule_ilp(Program* program);
void spill(Program* program, live& live_vars);
void insert_wait_states(Program* program);
bool dealloc_vgprs(Program* program);


@@ -0,0 +1,415 @@
/*
* Copyright 2023 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "aco_ir.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include <limits>
/*
* This pass implements a simple forward list-scheduler which works on a small
* partial DAG of 16 nodes at any time. Only ALU instructions are scheduled
* entirely freely. Memory load instructions must be kept in order and any other
* instruction must not be re-scheduled at all.
*
* The main goal of this scheduler is to create more memory clauses, to schedule
* memory loads early, and to improve ALU instruction-level parallelism.
*/
namespace aco {
namespace {
constexpr unsigned num_nodes = 16;
using mask_t = uint16_t;
static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);
struct InstrInfo {
Instruction* instr;
int32_t priority;
mask_t dependency_mask; /* bitmask of nodes which have to be scheduled before this node. */
uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
bool potential_clause; /* indicates that this instruction is not (yet) immediately followed by a
reorderable instruction. */
};
struct RegisterInfo {
mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
int8_t latency; /* estimated latency of last register write. */
uint8_t direct_dependency : 4; /* node that has to be scheduled before any other access. */
uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
uint8_t padding : 3;
};
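/* Illustrative example (not part of this file; exec handling omitted): for the
* sequence
*   node 0: v_mov_b32 v0, s0
*   node 1: v_add_f32 v1, v0, v0
*   node 2: v_mov_b32 v0, s1
* the write in node 0 sets regs[v0].direct_dependency = 0, so the read in
* node 1 adds bit 0 to its dependency_mask and bit 1 to regs[v0].read_mask.
* The write in node 2 then inherits the whole read_mask (bits 0 and 1), so it
* cannot be scheduled before either the earlier write or the read in node 1. */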
struct SchedILPContext {
Program* program;
InstrInfo nodes[num_nodes];
RegisterInfo regs[512];
mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
mask_t active_mask = 0; /* bitmask of valid instruction nodes. */
uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
};
/**
* Returns true for side-effect free SALU and VALU instructions.
*/
bool
can_reorder(const Instruction* const instr)
{
if (instr->isVALU())
return true;
if (!instr->isSALU() || instr->isSOPP())
return false;
switch (instr->opcode) {
/* SOP2 */
case aco_opcode::s_cbranch_g_fork:
case aco_opcode::s_rfe_restore_b64:
/* SOP1 */
case aco_opcode::s_setpc_b64:
case aco_opcode::s_swappc_b64:
case aco_opcode::s_rfe_b64:
case aco_opcode::s_cbranch_join:
case aco_opcode::s_set_gpr_idx_idx:
case aco_opcode::s_sendmsg_rtn_b32:
case aco_opcode::s_sendmsg_rtn_b64:
/* SOPK */
case aco_opcode::s_cbranch_i_fork:
case aco_opcode::s_getreg_b32:
case aco_opcode::s_setreg_b32:
case aco_opcode::s_setreg_imm32_b32:
case aco_opcode::s_call_b64:
case aco_opcode::s_waitcnt_vscnt:
case aco_opcode::s_waitcnt_vmcnt:
case aco_opcode::s_waitcnt_expcnt:
case aco_opcode::s_waitcnt_lgkmcnt:
case aco_opcode::s_subvector_loop_begin:
case aco_opcode::s_subvector_loop_end:
/* SOPC */
case aco_opcode::s_setvskip:
case aco_opcode::s_set_gpr_idx_on: return false;
default: break;
}
return true;
}
unsigned
get_latency(const Instruction* const instr)
{
/* Note that these are not accurate latency estimates. */
if (instr->isVALU())
return 5;
if (instr->isSALU())
return 2;
if (instr->isVMEM() || instr->isFlatLike())
return 32;
if (instr->isSMEM())
return 5;
if (instr->accessesLDS())
return 2;
return 0;
}
bool
is_memory_instr(const Instruction* const instr)
{
/* For memory instructions, we allow reordering them with ALU instructions if it helps
* to form larger clauses or to increase def-use distances.
*/
return instr->isVMEM() || instr->isFlatLike() || instr->isSMEM() || instr->accessesLDS();
}
constexpr unsigned max_sgpr = 128;
constexpr unsigned min_vgpr = 256;
void
add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
{
InstrInfo& entry = ctx.nodes[idx];
entry.instr = instr;
entry.priority = 0;
const mask_t mask = BITFIELD_BIT(idx);
bool reorder = can_reorder(instr);
ctx.active_mask |= mask;
for (const Operand& op : instr->operands) {
assert(op.isFixed());
unsigned reg = op.physReg();
if (reg >= max_sgpr && reg != scc && reg < min_vgpr) {
reorder &= reg != pops_exiting_wave_id;
continue;
}
for (unsigned i = 0; i < op.size(); i++) {
RegisterInfo& reg_info = ctx.regs[reg + i];
/* Add register reads. */
reg_info.read_mask |= mask;
int cycles_since_reg_write = num_nodes;
if (reg_info.has_direct_dependency) {
/* A previous dependency is still part of the DAG. */
entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
cycles_since_reg_write = ctx.nodes[reg_info.direct_dependency].priority;
}
if (reg_info.latency) {
/* Ignore and reset register latencies for memory loads and other non-reorderable
* instructions. We schedule these as early as possible anyway.
*/
if (reorder && reg_info.latency > cycles_since_reg_write) {
entry.priority = MIN2(entry.priority, cycles_since_reg_write - reg_info.latency);
/* If a previous register write created some latency, ensure that this
* is the first read of the register by making this instruction a direct
* dependency of all following register reads.
*/
reg_info.has_direct_dependency = 1;
reg_info.direct_dependency = idx;
}
reg_info.latency = 0;
}
}
}
/* Check if this instruction reads implicit registers. */
if (needs_exec_mask(instr)) {
for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
if (ctx.regs[reg].has_direct_dependency)
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
ctx.regs[reg].read_mask |= mask;
}
}
if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
if (ctx.regs[reg].has_direct_dependency)
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
ctx.regs[reg].read_mask |= mask;
}
}
for (const Definition& def : instr->definitions) {
for (unsigned i = 0; i < def.size(); i++) {
RegisterInfo& reg_info = ctx.regs[def.physReg().reg() + i];
/* Add all previous register reads and writes to the dependencies. */
entry.dependency_mask |= reg_info.read_mask;
reg_info.read_mask = mask;
/* This register write is a direct dependency for all following reads. */
reg_info.has_direct_dependency = 1;
reg_info.direct_dependency = idx;
/* Add latency information for the next register read. */
reg_info.latency = get_latency(instr);
}
}
if (!reorder) {
ctx.non_reorder_mask |= mask;
/* Set this node as the last non-reorderable instruction. */
if (ctx.next_non_reorderable == UINT8_MAX) {
ctx.next_non_reorderable = idx;
} else {
ctx.nodes[ctx.last_non_reorderable].next_non_reorderable = idx;
}
ctx.last_non_reorderable = idx;
entry.next_non_reorderable = UINT8_MAX;
/* Just don't reorder these at all. */
if (!is_memory_instr(instr) || instr->definitions.empty() ||
get_sync_info(instr).semantics & semantic_volatile) {
/* Add all previous instructions as dependencies. */
entry.dependency_mask = ctx.active_mask;
}
/* Remove non-reorderable instructions from dependencies, since WaR dependencies can interfere
* with clause formation. This should be fine, since these are always scheduled in-order and
* any cases that are actually a concern for clause formation are added as transitive
* dependencies. */
entry.dependency_mask &= ~ctx.non_reorder_mask;
entry.potential_clause = true;
} else if (ctx.last_non_reorderable != UINT8_MAX) {
ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
}
entry.dependency_mask &= ~mask;
for (unsigned i = 0; i < num_nodes; i++) {
if (!ctx.nodes[i].instr || i == idx)
continue;
/* Add transitive dependencies. */
if (entry.dependency_mask & BITFIELD_BIT(i))
entry.dependency_mask |= ctx.nodes[i].dependency_mask;
/* increment base priority */
ctx.nodes[i].priority++;
}
}
void
remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_t idx)
{
const mask_t mask = ~BITFIELD_BIT(idx);
ctx.active_mask &= mask;
for (const Operand& op : instr->operands) {
const unsigned reg = op.physReg();
if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
continue;
for (unsigned i = 0; i < op.size(); i++) {
RegisterInfo& reg_info = ctx.regs[reg + i];
reg_info.read_mask &= mask;
reg_info.has_direct_dependency &= reg_info.direct_dependency != idx;
}
}
if (needs_exec_mask(instr)) {
ctx.regs[exec_lo].read_mask &= mask;
ctx.regs[exec_hi].read_mask &= mask;
}
if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
ctx.regs[flat_scr_lo].read_mask &= mask;
ctx.regs[flat_scr_hi].read_mask &= mask;
}
for (const Definition& def : instr->definitions) {
for (unsigned i = 0; i < def.size(); i++) {
unsigned reg = def.physReg().reg() + i;
ctx.regs[reg].read_mask &= mask;
ctx.regs[reg].has_direct_dependency &= ctx.regs[reg].direct_dependency != idx;
}
}
for (unsigned i = 0; i < num_nodes; i++)
ctx.nodes[i].dependency_mask &= mask;
if (ctx.next_non_reorderable == idx) {
ctx.non_reorder_mask &= mask;
ctx.next_non_reorderable = ctx.nodes[idx].next_non_reorderable;
if (ctx.last_non_reorderable == idx)
ctx.last_non_reorderable = UINT8_MAX;
}
}
/**
* Returns a bitfield of nodes which have to be scheduled before the
* next non-reorderable instruction.
* If the next non-reorderable instruction can form a clause, returns the
* dependencies of the entire clause.
*/
mask_t
collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask_t clause_mask)
{
const InstrInfo& entry = ctx.nodes[next];
mask_t dependencies = entry.dependency_mask;
clause_mask |= (entry.potential_clause << next);
if (!is_memory_instr(entry.instr))
return dependencies;
/* If this is potentially an "open" clause, meaning that the clause might
* consist of instructions not yet added to the DAG, consider all previous
* instructions as dependencies. This prevents splitting of larger, already
* formed clauses.
*/
if (next == ctx.last_non_reorderable && entry.potential_clause)
return (~clause_mask & ctx.active_mask) | dependencies;
if (entry.next_non_reorderable == UINT8_MAX)
return dependencies;
/* Check if this can form a clause with the following non-reorderable instruction */
if (should_form_clause(entry.instr, ctx.nodes[entry.next_non_reorderable].instr)) {
mask_t clause_deps =
collect_clause_dependencies(ctx, entry.next_non_reorderable, clause_mask);
/* If the following clause is independent of this one, add its dependencies. */
if (!(clause_deps & BITFIELD_BIT(next)))
dependencies |= clause_deps;
}
return dependencies;
}
/**
* Returns the index of the next instruction to be selected.
*/
unsigned
select_instruction(const SchedILPContext& ctx)
{
mask_t mask = ctx.active_mask;
/* First, collect all dependencies of the next non-reorderable instruction(s).
* These make up the list of possible candidates.
*/
if (ctx.next_non_reorderable != UINT8_MAX)
mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
/* If the next non-reorderable instruction has no dependencies, select it */
if (mask == 0)
return ctx.next_non_reorderable;
/* Otherwise, select the instruction with highest priority of all candidates. */
unsigned idx = -1u;
int32_t priority = INT32_MIN;
u_foreach_bit (i, mask) {
const InstrInfo& candidate = ctx.nodes[i];
/* Check if the candidate has pending dependencies. */
if (candidate.dependency_mask)
continue;
if (idx == -1u || candidate.priority > priority) {
idx = i;
priority = candidate.priority;
}
}
assert(idx != -1u);
return idx;
}
} // namespace
void
schedule_ilp(Program* program)
{
SchedILPContext ctx = {program};
for (Block& block : program->blocks) {
auto it = block.instructions.begin();
for (unsigned i = 0; i < num_nodes; i++) {
if (it == block.instructions.end())
break;
add_entry(ctx, (it++)->get(), i);
}
auto insert_it = block.instructions.begin();
while (insert_it != block.instructions.end()) {
unsigned next_idx = select_instruction(ctx);
Instruction* next_instr = ctx.nodes[next_idx].instr;
remove_entry(ctx, next_instr, next_idx);
(insert_it++)->reset(next_instr);
ctx.nodes[next_idx].instr = NULL;
if (it != block.instructions.end()) {
add_entry(ctx, (it++)->get(), next_idx);
} else if (ctx.last_non_reorderable != UINT8_MAX) {
ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
ctx.last_non_reorderable = UINT8_MAX;
}
}
assert(it == block.instructions.end());
}
}
} // namespace aco


@@ -75,6 +75,7 @@ libaco_files = files(
'aco_print_ir.cpp',
'aco_reindex_ssa.cpp',
'aco_scheduler.cpp',
'aco_scheduler_ilp.cpp',
'aco_spill.cpp',
'aco_ssa_elimination.cpp',
'aco_statistics.cpp',


@@ -398,10 +398,10 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y ; $_
//>> v_rndne_f32_e32 v#rl, v#rl_tmp ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_rndne_f32_e32 v#rl, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rl
//>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf da ; $_ $_