aco/sched_ilp: new latency heuristic

The main idea is that latency should be accounted for only after the
write has been scheduled. As a result, good scheduling results depend
far less on the input order of the instructions.

Foz-DB GFX1150:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 43274326 -> 42129011 (-2.65%); split: -2.65%, +0.01%
CodeSize: 223049932 -> 218465796 (-2.06%); split: -2.06%, +0.00%
Latency: 297614199 -> 292317054 (-1.78%); split: -1.84%, +0.06%
InvThroughput: 57020160 -> 56336213 (-1.20%); split: -1.21%, +0.02%
VClause: 841775 -> 841861 (+0.01%); split: -0.06%, +0.07%
SClause: 1253516 -> 1253798 (+0.02%); split: -0.03%, +0.05%
VALU: 23893837 -> 23893828 (-0.00%); split: -0.00%, +0.00%

Foz-DB Navi31:
Totals from 75606 (95.25% of 79377) affected shaders:
Instrs: 42717592 -> 41531696 (-2.78%); split: -2.78%, +0.00%
CodeSize: 223582476 -> 218866196 (-2.11%); split: -2.11%, +0.00%
Latency: 297736383 -> 292450493 (-1.78%); split: -1.83%, +0.05%
InvThroughput: 47298730 -> 46934084 (-0.77%); split: -0.78%, +0.01%
VClause: 844982 -> 844892 (-0.01%); split: -0.07%, +0.06%
SClause: 1248433 -> 1248693 (+0.02%); split: -0.03%, +0.05%
VALU: 24819703 -> 24819704 (+0.00%); split: -0.00%, +0.00%

Foz-DB Navi21:
Totals from 76224 (96.03% of 79377) affected shaders:
Instrs: 46019515 -> 46015691 (-0.01%); split: -0.03%, +0.03%
CodeSize: 246992544 -> 246977404 (-0.01%); split: -0.03%, +0.02%
Latency: 324647457 -> 318661132 (-1.84%); split: -1.90%, +0.05%
InvThroughput: 74834800 -> 74269723 (-0.76%); split: -0.76%, +0.01%
VClause: 927601 -> 927579 (-0.00%); split: -0.04%, +0.04%
SClause: 1302666 -> 1303178 (+0.04%); split: -0.02%, +0.06%

Foz-DB Vega10:
Totals from 60142 (95.42% of 63026) affected shaders:
Instrs: 25117688 -> 25098175 (-0.08%); split: -0.10%, +0.02%
CodeSize: 129847464 -> 129769456 (-0.06%); split: -0.08%, +0.02%
Latency: 261606546 -> 262407481 (+0.31%); split: -0.12%, +0.43%
InvThroughput: 138422594 -> 138500401 (+0.06%); split: -0.03%, +0.09%
VClause: 555424 -> 555321 (-0.02%); split: -0.11%, +0.09%
SClause: 851219 -> 851620 (+0.05%); split: -0.03%, +0.08%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
This commit is contained in:
Georg Lehmann 2025-01-26 13:41:53 +01:00 committed by Marge Bot
parent 0f13a42657
commit 819938d2fa
2 changed files with 58 additions and 39 deletions

View file

@@ -6,6 +6,7 @@
#include "aco_ir.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/macros.h"
#include <limits>
@@ -40,14 +41,15 @@ struct VOPDInfo {
struct InstrInfo {
Instruction* instr;
int32_t priority;
int16_t priority;
mask_t dependency_mask; /* bitmask of nodes which have to be scheduled before this node. */
mask_t write_for_read_mask; /* bitmask of nodes in the DAG that have a RaW dependency. */
uint8_t next_non_reorderable; /* index of next non-reorderable instruction node after this one. */
};
struct RegisterInfo {
mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
int8_t latency; /* estimated latency of last register write. */
int8_t latency; /* estimated outstanding latency of last register write outside the DAG. */
uint8_t direct_dependency : 4; /* node that has to be scheduled before any other access. */
uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
uint8_t padding : 3;
@@ -58,6 +60,7 @@ struct SchedILPContext {
bool is_vopd = false;
InstrInfo nodes[num_nodes];
RegisterInfo regs[512];
BITSET_DECLARE(reg_has_latency, 512) = { 0 };
mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
mask_t active_mask = 0; /* bitmask of valid instruction nodes. */
uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
@@ -318,6 +321,7 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
InstrInfo& entry = ctx.nodes[idx];
entry.instr = instr;
entry.priority = 0;
entry.write_for_read_mask = 0;
const mask_t mask = BITFIELD_BIT(idx);
bool reorder = can_reorder(instr);
ctx.active_mask |= mask;
@@ -346,28 +350,12 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* Add register reads. */
reg_info.read_mask |= mask;
int cycles_since_reg_write = num_nodes;
if (reg_info.has_direct_dependency) {
/* A previous dependency is still part of the DAG. */
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
cycles_since_reg_write = ctx.nodes[reg_info.direct_dependency].priority;
}
if (reg_info.latency) {
/* Ignore and reset register latencies for memory loads and other non-reorderable
* instructions. We schedule these as early as possible anyways.
*/
if (reorder && reg_info.latency > cycles_since_reg_write) {
entry.priority = MIN2(entry.priority, cycles_since_reg_write - reg_info.latency);
/* If a previous register write created some latency, ensure that this
* is the first read of the register by making this instruction a direct
* dependency of all following register reads.
*/
reg_info.has_direct_dependency = 1;
reg_info.direct_dependency = idx;
}
reg_info.latency = 0;
} else if (BITSET_TEST(ctx.reg_has_latency, reg + i)) {
entry.priority = MIN2(entry.priority, -reg_info.latency);
}
}
}
@@ -375,15 +363,19 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* Check if this instructions reads implicit registers. */
if (needs_exec_mask(instr)) {
for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
if (ctx.regs[reg].has_direct_dependency)
if (ctx.regs[reg].has_direct_dependency) {
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
}
ctx.regs[reg].read_mask |= mask;
}
}
if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
if (ctx.regs[reg].has_direct_dependency)
if (ctx.regs[reg].has_direct_dependency) {
entry.dependency_mask |= BITFIELD_BIT(ctx.regs[reg].direct_dependency);
ctx.nodes[ctx.regs[reg].direct_dependency].write_for_read_mask |= mask;
}
ctx.regs[reg].read_mask |= mask;
}
}
@@ -400,11 +392,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* This register write is a direct dependency for all following reads. */
reg_info.has_direct_dependency = 1;
reg_info.direct_dependency = idx;
if (!ctx.is_vopd) {
/* Add latency information for the next register read. */
reg_info.latency = get_latency(instr);
}
}
}
@@ -447,9 +434,6 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
/* Add transitive dependencies. */
if (entry.dependency_mask & BITFIELD_BIT(i))
entry.dependency_mask |= ctx.nodes[i].dependency_mask;
/* increment base priority */
ctx.nodes[i].priority++;
}
}
@@ -459,6 +443,24 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
const mask_t mask = ~BITFIELD_BIT(idx);
ctx.active_mask &= mask;
int stall = 1; /* Assume all instructions take one cycle to issue. */
if (ctx.nodes[idx].priority < 0) {
/* Add remaining latency stall. */
stall -= ctx.nodes[idx].priority;
}
if (!ctx.is_vopd) {
unsigned i;
BITSET_FOREACH_SET (i, ctx.reg_has_latency, 512) {
if (ctx.regs[i].latency <= stall) {
ctx.regs[i].latency = 0;
BITSET_CLEAR(ctx.reg_has_latency, i);
} else {
ctx.regs[i].latency -= stall;
}
}
}
for (const Operand& op : instr->operands) {
const unsigned reg = op.physReg();
if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
@@ -467,7 +469,6 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
for (unsigned i = 0; i < op.size(); i++) {
RegisterInfo& reg_info = ctx.regs[reg + i];
reg_info.read_mask &= mask;
reg_info.has_direct_dependency &= reg_info.direct_dependency != idx;
}
}
if (needs_exec_mask(instr)) {
@@ -478,16 +479,30 @@ remove_entry(SchedILPContext& ctx, const Instruction* const instr, const uint32_
ctx.regs[flat_scr_lo].read_mask &= mask;
ctx.regs[flat_scr_hi].read_mask &= mask;
}
const int8_t latency = get_latency(instr);
for (const Definition& def : instr->definitions) {
for (unsigned i = 0; i < def.size(); i++) {
unsigned reg = def.physReg().reg() + i;
ctx.regs[reg].read_mask &= mask;
ctx.regs[reg].has_direct_dependency &= ctx.regs[reg].direct_dependency != idx;
if (ctx.regs[reg].has_direct_dependency && ctx.regs[reg].direct_dependency == idx) {
ctx.regs[reg].has_direct_dependency = false;
if (!ctx.is_vopd) {
BITSET_SET(ctx.reg_has_latency, reg);
ctx.regs[reg].latency = latency;
}
}
}
}
for (unsigned i = 0; i < num_nodes; i++)
for (unsigned i = 0; i < num_nodes; i++) {
ctx.nodes[i].dependency_mask &= mask;
ctx.nodes[i].priority += stall;
if (ctx.nodes[idx].write_for_read_mask & BITFIELD_BIT(i) && !ctx.is_vopd) {
ctx.nodes[i].priority = MIN2(ctx.nodes[i].priority, -latency);
}
}
if (ctx.next_non_reorderable == idx) {
ctx.non_reorder_mask &= mask;
@@ -782,10 +797,14 @@ schedule_ilp(Program* program)
SchedILPContext ctx = {program};
for (Block& block : program->blocks) {
if (block.instructions.empty())
continue;
auto it = block.instructions.begin();
auto insert_it = block.instructions.begin();
do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
block.instructions.resize(insert_it - block.instructions.begin());
if (block.linear_succs.empty() || block.instructions.back()->opcode == aco_opcode::s_branch)
BITSET_ZERO(ctx.reg_has_latency);
}
}

View file

@@ -83,8 +83,8 @@ BEGIN_TEST(d3d11_derivs.constant)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, -0.5 ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> image_sample v[#_:#_], v[#rx:#ry], s[#_:#_], s[#_:#_] dmask:0xf dim:SQ_RSRC_IMG_2D ; $_ $_
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "Assembly");
@@ -334,8 +334,8 @@ BEGIN_TEST(d3d11_derivs._1d_gfx9)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//; success = rx+1 == ry
//>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf ; $_ $_
@@ -376,9 +376,9 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
//>> p_end_linear_vgpr (kill)%wqm
pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y ; $_
//>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x ; $_
//>> v_mov_b32_e32 v#ry, 0.5 ; $_
//>> v_rndne_f32_e32 v#rl_tmp, v#rl_tmp ; $_
//>> v_mov_b32_e32 v#rx, v#rx_tmp ; $_
//>> v_mov_b32_e32 v#rl, v#rl_tmp ; $_
@@ -472,11 +472,11 @@ BEGIN_TEST(d3d11_derivs.cube_array)
//>> v_cubeid_f32 v#rf, v#_, v#_, v#_ ; $_ $_
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
//>> v_fmaak_f32 v#rx_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmaak_f32 v#ry_tmp, v#_, v#_, 0x3fc00000 ; $_ $_
//>> v_fmamk_f32 v#rlf_tmp, v#rl, 0x41000000, v#rf ; $_ $_
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp] ; $_ $_
//>> v_mov_b32_e32 v#rlf, v#rlf_tmp ; $_
//>> v_lshrrev_b64 v[#rx:#ry], 0, v[#rx_tmp:#ry_tmp] ; $_ $_
//>> BB1:
//; success = rx+1 == ry and rx+2 == rlf