From 35ac517780caff8546bbd4fcff6f6398e231148a Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Wed, 16 Jul 2025 15:34:43 -0700 Subject: [PATCH] intel/brw/xe3+: Define BRW_SCHEDULE_PRE_LATENCY scheduling mode. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This defines a new pre-RA scheduling mode similar to BRW_SCHEDULE_PRE but more aggressive at optimizing for minimum latency rather than minimum register usage. The main motivation is that on recent xe3 platforms we use a register allocation heuristic that packs variables more tightly at the bottom of the register file instead of the round-robin heuristic we used on previous platforms, since as a result of VRT there is a parallelism penalty when a program uses more GRF registers than necessary. Unfortunately the xe3 tight-packing heuristic severely constrains the work of the post-RA scheduler due to the false dependencies introduced during register allocation, so we can do a better job by making the scheduler aware of instruction latencies before the register allocator introduces any false dependencies. This can lead to higher register pressure, but only when the scheduler decides it could save cycles by extending a live range. It makes sense to preserve the preexisting BRW_SCHEDULE_PRE as a separate mode since some workloads can still benefit from neglecting latencies pre-RA due to the trade-off mentioned between parallelism and GRF use; a future commit will introduce a more accurate estimate of the expected relative performance of BRW_SCHEDULE_PRE vs. BRW_SCHEDULE_PRE_LATENCY taking into account this trade-off. In theory this could also be helpful on earlier pre-xe3 platforms, but the benefit should be significantly smaller due to the different RA heuristic, so it hasn't been tested extensively pre-xe3. 
The following Traci tests are improved significantly by this change on PTL (nearly all tests that run on my system are affected positively): Ghostrunner2-trace-dx11-1440p-ultra: 7.12% ±0.36% SpaceEngineers-trace-dx11-2160p-high: 5.77% ±0.43% HogwartsLegacy-trace-dx12-1080p-ultra: 4.40% ±0.03% Naraka-trace-dx11-1440p-highest: 3.06% ±0.43% MetroExodus-trace-dx11-2160p-ultra: 2.26% ±0.60% Fortnite-trace-dx11-2160p-epix: 2.12% ±0.53% Nba2K23-trace-dx11-2160p-ultra: 1.98% ±0.30% Control-trace-dx11-1440p-high: 1.93% ±0.36% GodOfWar-trace-dx11-2160p-ultra: 1.62% ±0.47% TotalWarPharaoh-trace-dx11-1440p-ultra: 1.55% ±0.18% MountAndBlade2-trace-dx11-1440p-veryhigh: 1.51% ±0.37% Destiny2-trace-dx11-1440p-highest: 1.44% ±0.34% GtaV-trace-dx11-2160p-ultra: 1.26% ±0.27% ShadowTombRaider-trace-dx11-2160p-ultra: 1.10% ±0.58% Borderlands3-trace-dx11-2160p-ultra: 0.95% ±0.43% TerminatorResistance-trace-dx11-2160p-ultra: 0.87% ±0.22% BaldursGate3-trace-dx11-1440p-ultra: 0.84% ±0.28% CitiesSkylines2-trace-dx11-1440p-high: 0.82% ±0.22% PubG-trace-dx11-1440p-ultra: 0.72% ±0.37% Palworld-trace-dx11-1080p-med: 0.71% ±0.26% Superposition-trace-dx11-2160p-extreme: 0.69% ±0.19% The compile-time cost of shader-db increases significantly by 1.85% after this commit (14 iterations, 5% significance), the compile-time of fossil-db doesn't change significantly in my setup. v2: Addressed interaction with 81594d0db180398f48634438c8c8b5b9ab6a, since the code that calculates deps, delays and exits is no longer mode-independent after this change. Instead of reverting that commit (which is non-trivial and would have a greater compile-time hit) simply reconstruct the scheduler object during the transition between BRW_SCHEDULE_PRE_LATENCY and any other PRE mode that doesn't require instruction latencies. 
Reviewed-by: Lionel Landwerlin Part-of: --- .../compiler/brw_schedule_instructions.cpp | 43 +++++++++++++++---- src/intel/compiler/brw_shader.cpp | 5 +++ src/intel/compiler/brw_shader.h | 1 + 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 5c51893d4fb..b2fe12aca57 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -587,7 +587,7 @@ schedule_node::set_latency(const struct brw_isa_info *isa) class brw_instruction_scheduler { public: brw_instruction_scheduler(void *mem_ctx, const brw_shader *s, int grf_count, int hw_reg_count, - int block_count, bool post_reg_alloc); + int block_count, bool post_reg_alloc, bool need_latencies); void add_barrier_deps(schedule_node *n); void add_cross_lane_deps(schedule_node *n); @@ -706,8 +706,9 @@ public: }; brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_shader *s, - int grf_count, int hw_reg_count, - int block_count, bool post_reg_alloc) + int grf_count, int hw_reg_count, + int block_count, bool post_reg_alloc, + bool need_latencies) : s(s) { this->mem_ctx = mem_ctx; @@ -724,7 +725,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh foreach_block_and_inst(block, brw_inst, inst, s->cfg) { n->inst = inst; - if (!post_reg_alloc) + if (!need_latencies) n->latency = 1; else n->set_latency(isa); @@ -742,7 +743,7 @@ brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const brw_sh current.available.make_empty(); this->hw_reg_count = hw_reg_count; - this->mode = BRW_SCHEDULE_NONE; + this->mode = (need_latencies ? 
BRW_SCHEDULE_PRE_LATENCY : BRW_SCHEDULE_NONE); this->reg_pressure = 0; if (!post_reg_alloc) { @@ -1571,7 +1572,7 @@ brw_instruction_scheduler::choose_instruction_to_schedule() { schedule_node *chosen = NULL; - if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_POST) { + if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_PRE_LATENCY || mode == BRW_SCHEDULE_POST) { int chosen_time = 0; /* Of the instructions ready to execute or the closest to being ready, @@ -1848,8 +1849,16 @@ brw_prepare_scheduler(brw_shader &s, void *mem_ctx) const int grf_count = s.alloc.count; brw_instruction_scheduler *empty = rzalloc(mem_ctx, brw_instruction_scheduler); - return new (empty) brw_instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf, - s.cfg->num_blocks, /* post_reg_alloc */ false); + return new (empty) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, grf_count, + s.first_non_payload_grf, s.cfg->num_blocks, + /* post_reg_alloc */ false, s.devinfo->ver >= 30); +} + +static bool +needs_instruction_latencies(brw_instruction_scheduler_mode mode) +{ + return mode == BRW_SCHEDULE_PRE_LATENCY || + mode == BRW_SCHEDULE_POST; } void @@ -1859,6 +1868,21 @@ brw_schedule_instructions_pre_ra(brw_shader &s, brw_instruction_scheduler *sched if (mode == BRW_SCHEDULE_NONE) return; + if (needs_instruction_latencies(mode) != needs_instruction_latencies(sched->mode)) { + /* The new mode requires different instruction latencies, which + * requires recalculating the dependency graph as well as the + * delay and exit metadata. Instead of maintaining a codepath + * to reset and recompute most of the scheduler data structure + * simply recreate the scheduler object. 
+ */ + void *mem_ctx = ralloc_parent(sched); + ralloc_free(sched->mem_ctx); + new (sched) brw_instruction_scheduler(ralloc_context(mem_ctx), &s, s.alloc.count, + s.first_non_payload_grf, s.cfg->num_blocks, + /* post_reg_alloc */ false, + needs_instruction_latencies(mode)); + } + sched->run(mode); s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS); @@ -1868,12 +1892,13 @@ void brw_schedule_instructions_post_ra(brw_shader &s) { const bool post_reg_alloc = true; + const bool need_latencies = true; const int grf_count = reg_unit(s.devinfo) * s.grf_used; void *mem_ctx = ralloc_context(NULL); brw_instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf, - s.cfg->num_blocks, post_reg_alloc); + s.cfg->num_blocks, post_reg_alloc, need_latencies); sched.run(BRW_SCHEDULE_POST); ralloc_free(mem_ctx); diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 5bbb4b10bd1..360e9794363 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -1087,6 +1087,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) bool allocated; static const enum brw_instruction_scheduler_mode pre_modes[] = { + BRW_SCHEDULE_PRE_LATENCY, BRW_SCHEDULE_PRE, BRW_SCHEDULE_PRE_NON_LIFO, BRW_SCHEDULE_NONE, @@ -1094,6 +1095,7 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) }; static const char *scheduler_mode_name[] = { + [BRW_SCHEDULE_PRE_LATENCY] = "latency-sensitive", [BRW_SCHEDULE_PRE] = "top-down", [BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo", [BRW_SCHEDULE_PRE_LIFO] = "lifo", @@ -1130,6 +1132,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling) for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { enum brw_instruction_scheduler_mode sched_mode = pre_modes[i]; + if (devinfo->ver < 30 && sched_mode == BRW_SCHEDULE_PRE_LATENCY) + continue; + brw_schedule_instructions_pre_ra(s, sched, sched_mode); s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; diff --git 
a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index 44aa044c12a..3d6a0fd9511 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -286,6 +286,7 @@ void brw_calculate_cfg(brw_shader &s); void brw_optimize(brw_shader &s); enum brw_instruction_scheduler_mode { + BRW_SCHEDULE_PRE_LATENCY, BRW_SCHEDULE_PRE, BRW_SCHEDULE_PRE_NON_LIFO, BRW_SCHEDULE_PRE_LIFO,