jay: schedule for pressure

Implement a simple pre-RA bottom-up list scheduler with the goal of decreasing register pressure. On Xe2, this significantly reduces spilling. SSA form allows us to estimate register demand cheaply and accurately, which theoretically [1] gives this algorithm the two Hippocratic properties: 1. Shaders with low register pressure are unaffected. 2. Register pressure can only be decreased, never increased. In other words: first, do no harm. The heuristic itself is very simple: greedily choose instructions that decrease liveness using a backwards list scheduler. This is far from optimal! But thanks to the above properties, even a heuristic that picked random instructions would be a win overall - by construction, we can only ever win. In other words: this scheduler is your older brother powering off the game console any time he's about to lose a game, maintaining a 100% win rate. [1] In reality, neither property is strictly satisfied due to the messy details of mapping our clean logical model onto Intel's many weird physical register files. Nevertheless, the algorithm is well-motivated and the empirical results on Xe2 are excellent. 
SIMD16: Totals: Instrs: 2754194 -> 2753957 (-0.01%); split: -0.23%, +0.22% CodeSize: 41094768 -> 41092768 (-0.00%); split: -0.23%, +0.23% Number of spill instructions: 1724 -> 1129 (-34.51%) Number of fill instructions: 1912 -> 1119 (-41.47%) Totals from 168 (6.35% of 2647) affected shaders: Instrs: 850994 -> 850757 (-0.03%); split: -0.75%, +0.73% CodeSize: 12825680 -> 12823680 (-0.02%); split: -0.74%, +0.73% Number of spill instructions: 1724 -> 1129 (-34.51%) Number of fill instructions: 1912 -> 1119 (-41.47%) SIMD32: Totals: Instrs: 4688858 -> 4557800 (-2.80%); split: -3.53%, +0.74% CodeSize: 70177200 -> 68214816 (-2.80%); split: -3.53%, +0.74% Number of spill instructions: 50316 -> 45795 (-8.99%); split: -9.56%, +0.57% Number of fill instructions: 51526 -> 45075 (-12.52%); split: -13.23%, +0.71% Totals from 819 (30.94% of 2647) affected shaders: Instrs: 3810182 -> 3679124 (-3.44%); split: -4.35%, +0.91% CodeSize: 57044000 -> 55081616 (-3.44%); split: -4.35%, +0.91% Number of spill instructions: 49264 -> 44743 (-9.18%); split: -9.76%, +0.58% Number of fill instructions: 50182 -> 43731 (-12.86%); split: -13.58%, +0.73% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41688>
2026-05-23 23:48:18 +02:00 · 2026-05-13 13:23:08 -04:00 · 2026-05-13 13:23:08 -04:00 · bc22a37d98
commit bc22a37d98
parent 81e21a8756
8 changed files with 462 additions and 0 deletions
--- a/src/intel/compiler/jay/jay_dag.c
+++ b/src/intel/compiler/jay/jay_dag.c
@ -0,0 +1,85 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * Copyright 2019 Broadcom
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "jay_dag.h"
+#include <stdint.h>
+#include "util/ralloc.h"
+#include "util/u_dynarray.h"
+
+/**
+ * Prepares an empty DAG with room for node_count nodes. The per-node arrays
+ * are allocated with rzalloc_array from memctx, which also backs the heads and
+ * edges lists. Node 0 is reserved, so construction begins at node 1.
+ */
+void
+jay_dag_init(struct jay_dag *dag, void *memctx, uint32_t node_count)
+{
+   assert(node_count >= 1 && "node 0 is reserved and always present");
+
+   /* Start from an all-zero structure, then fill in what is not zero */
+   *dag = (struct jay_dag) { 0 };
+
+   dag->node = 1;
+   dag->node_count = node_count;
+   dag->adjacency = rzalloc_array(memctx, uint32_t, node_count);
+   dag->parent_counts = rzalloc_array(memctx, uint32_t, node_count);
+
+   util_dynarray_init(&dag->heads, memctx);
+   util_dynarray_init(&dag->edges, memctx);
+}
+
+/**
+ * Adds an edge from the node currently under construction (dag->node) to
+ * child and bumps child's parent count. Edges to the reserved node 0,
+ * self-edges, and edges already recorded for the current node are ignored.
+ */
+void
+jay_dag_add_edge(struct jay_dag *dag, uint32_t child)
+{
+   /* Degenerate edges are silently dropped */
+   if (child == 0 || child == dag->node) {
+      return;
+   }
+
+   assert(child < dag->node_count);
+
+   /* The current node's edges are the tail of the edge list, beginning where
+    * the previous node's edges ended.
+    */
+   uint32_t begin = 0;
+   if (dag->node > 0) {
+      begin = dag->adjacency[dag->node - 1];
+   }
+
+   const uint32_t end = util_dynarray_num_elements(&dag->edges, uint32_t);
+
+   /* Duplicate edges are dropped as well */
+   for (uint32_t e = begin; e < end; ++e) {
+      const uint32_t *existing =
+         util_dynarray_element(&dag->edges, uint32_t, e);
+
+      if (*existing == child) {
+         return;
+      }
+   }
+
+   util_dynarray_append(&dag->edges, child);
+   dag->parent_counts[child]++;
+}
+
+/**
+ * Finishes the node under construction: records where its edges end in the
+ * flat edge list, then advances to the next node index.
+ */
+void
+jay_dag_next_node(struct jay_dag *dag)
+{
+   assert(dag->node < dag->node_count);
+
+   const uint32_t edges_end =
+      util_dynarray_num_elements(&dag->edges, uint32_t);
+
+   dag->adjacency[dag->node] = edges_end;
+   dag->node++;
+}
+
+/**
+ * Seeds the heads list with every node in [first_node, dag->node) whose
+ * parent count is zero, visiting nodes from the most recently added down to
+ * first_node.
+ *
+ * Node 0 is reserved and is never reported as a head, so a first_node of 0 is
+ * treated as 1.
+ */
+void
+jay_dag_finalize(struct jay_dag *dag, uint32_t first_node)
+{
+   const uint32_t lowest = first_node ? first_node : 1;
+
+   /* Test before decrementing so the unsigned index cannot wrap around below
+    * the lower bound, which would otherwise loop forever and index out of
+    * bounds.
+    */
+   for (uint32_t i = dag->node; i-- > lowest;) {
+      if (dag->parent_counts[i] == 0) {
+         util_dynarray_append(&dag->heads, i);
+      }
+   }
+}
+
+/**
+ * Retires a head: it is deleted from the heads list and each of its outgoing
+ * edges is released. Any target whose parent count drops to zero as a result
+ * is appended to the heads list.
+ */
+void
+jay_dag_prune_head(struct jay_dag *dag, uint32_t head)
+{
+   assert(!dag->parent_counts[head]);
+   util_dynarray_delete_unordered(&dag->heads, uint32_t, head);
+
+   /* Node N owns the edges in [adjacency[N - 1], adjacency[N]) */
+   const uint32_t end = dag->adjacency[head];
+   uint32_t begin = 0;
+   if (head > 0) {
+      begin = dag->adjacency[head - 1];
+   }
+
+   for (uint32_t e = begin; e < end; ++e) {
+      uint32_t target = *util_dynarray_element(&dag->edges, uint32_t, e);
+
+      dag->parent_counts[target]--;
+      if (dag->parent_counts[target] == 0) {
+         util_dynarray_append(&dag->heads, target);
+      }
+   }
+}
+
+
--- a/src/intel/compiler/jay/jay_dag.h
+++ b/src/intel/compiler/jay/jay_dag.h
@ -0,0 +1,22 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * Copyright 2019 Broadcom
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include "util/u_dynarray.h"
+
+/*
+ * Dependency graph stored as flat arrays indexed by node number. Node 0 is
+ * reserved; real nodes are numbered from 1 in the order they are created.
+ */
+struct jay_dag {
+   /* uint32_t node indices that currently have no parents */
+   struct util_dynarray heads;
+
+   /* uint32_t edge targets for all nodes, grouped by the node they leave */
+   struct util_dynarray edges;
+
+   /* parent_counts[n] is the number of recorded edges that target node n */
+   uint32_t *parent_counts;
+
+   /* adjacency[n] is the offset in edges just past node n's last edge. Node
+    * n's edges therefore start at adjacency[n - 1].
+    */
+   uint32_t *adjacency;
+
+   /* Index of the node currently being constructed */
+   uint32_t node;
+
+   /* Number of entries in parent_counts and adjacency */
+   uint32_t node_count;
+};
+
+/* Initialize an empty DAG with capacity for node_count nodes (including the
+ * reserved node 0), allocating from memctx.
+ */
+void jay_dag_init(struct jay_dag *dag, void *memctx, uint32_t node_count);
+
+/* Add an edge from the node under construction to child. Edges to node 0,
+ * self-edges and duplicates are ignored.
+ */
+void jay_dag_add_edge(struct jay_dag *dag, uint32_t child);
+
+/* Close the node under construction and move on to the next node index */
+void jay_dag_next_node(struct jay_dag *dag);
+
+/* Add every parentless node numbered first_node or higher to the heads list */
+void jay_dag_finalize(struct jay_dag *dag, uint32_t first_node);
+
+/* Remove a head from the graph, promoting targets left without parents */
+void jay_dag_prune_head(struct jay_dag *dag, uint32_t head);
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@ -41,6 +41,7 @@ static const struct debug_named_value jay_debug_options[] = {
   { "spill",       JAY_DBG_SPILL,       "Shrink register file to test spilling" },
   { "sync",        JAY_DBG_SYNC,        "Sync after every instruction"          },
   { "noacc",       JAY_DBG_NOACC,       "Disable accumulator substitution"      },
+   { "nosched",     JAY_DBG_NOSCHED,     "Disable scheduling"                    },
   DEBUG_NAMED_VALUE_END
 };

@ -2705,6 +2706,10 @@ jay_compile(const struct intel_device_info *devinfo,
      jay_print(stdout, s);
   }

+   if (!(jay_debug & JAY_DBG_NOSCHED)) {
+      JAY_PASS(s, jay_schedule_pressure);
+   }
+
   JAY_PASS(s, jay_assign_flags);
   if (!(jay_debug & JAY_DBG_NOOPT)) {
      JAY_PASS(s, jay_opt_dead_code);
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@ -1098,6 +1098,10 @@ typedef struct jay_block {

   /** Pretty printing based on original structured control flow */
   uint8_t indent;
+
+   /* Register demand metadata calculated for scheduling use */
+   unsigned demand_max[JAY_NUM_SSA_FILES];
+   unsigned demand_out[JAY_NUM_SSA_FILES];
 } jay_block;

 static inline jay_block *
--- a/src/intel/compiler/jay/jay_liveness.c
+++ b/src/intel/compiler/jay/jay_liveness.c
@ -211,6 +211,11 @@ jay_calculate_register_demands(jay_function *func)
            jay_print_inst(stdout, I);
         }
      }
+
+      jay_foreach_ssa_file(f) {
+         block->demand_max[f] = max_demand[f];
+         block->demand_out[f] = demands[f];
+      }
   }

   free(files);
--- a/src/intel/compiler/jay/jay_private.h
+++ b/src/intel/compiler/jay/jay_private.h
@ -17,6 +17,7 @@ extern "C" {
 #define JAY_DBG_SPILL       BITFIELD_BIT(2)
 #define JAY_DBG_SYNC        BITFIELD_BIT(3)
 #define JAY_DBG_NOACC       BITFIELD_BIT(4)
+#define JAY_DBG_NOSCHED     BITFIELD_BIT(5)
 extern int jay_debug;

 bool jay_nir_lower_bool(nir_shader *nir);
@ -70,6 +71,8 @@ void jay_opt_propagate_backwards(jay_shader *s);
 void jay_opt_dead_code(jay_shader *s);
 void jay_opt_predicate(jay_shader *s);

+void jay_schedule_pressure(jay_shader *s);
+
 void jay_lower_pre_ra(jay_shader *s);
 void jay_lower_post_ra(jay_shader *s);
 void jay_lower_spill(jay_function *func);
--- a/src/intel/compiler/jay/jay_schedule.c
+++ b/src/intel/compiler/jay/jay_schedule.c
@ -0,0 +1,336 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2022 Collabora Ltd.
+ * Copyright 2019 Broadcom
+ * SPDX-License-Identifier: MIT
+ */
+
+/*
+ * This file implements a simple pre-RA bottom-up list scheduler with the goal
+ * of decreasing register pressure. On Xe2, this significantly reduces spilling.
+ *
+ * SSA form allows us to estimate register demand cheaply and accurately, which
+ * theoretically [1] gives this algorithm the two Hippocratic properties:
+ *
+ * 1. Shaders with low register pressure are unaffected.
+ * 2. Register pressure can only be decreased, never increased.
+ *
+ * In other words: first, do no harm.
+ *
+ * The heuristic itself is very simple: greedily choose instructions that
+ * decrease liveness using a backwards list scheduler. This is far from optimal!
+ * But thanks to the above properties, even a heuristic that picked random
+ * instructions would be a win overall - by construction, we can only ever win.
+ *
+ * [1] In reality, neither property is strictly satisfied due to the messy
+ * details of mapping our clean logical model onto Intel's many weird physical
+ * register files. Nevertheless, the algorithm is well-motivated and the
+ * empirical results on Xe2 are excellent.
+ */
+
+#include "util/bitset.h"
+#include "util/ralloc.h"
+#include "util/sparse_bitset.h"
+#include "util/u_dynarray.h"
+#include "jay_builder.h"
+#include "jay_dag.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+struct sched_ctx { /* Scheduler state shared by every block of a function */
+   struct jay_dag dag; /* Dependency graph; its heads are the candidates */
+   unsigned dispatch_width; /* Weight scale() gives a non-uniform value */
+   jay_inst **insts; /* Instruction belonging to each DAG node index */
+   struct u_sparse_bitset live; /* SSA indices live at the current point of
+                                 * the bottom-up walk */
+   BITSET_WORD *seen; /* Scratch for spotting repeated sources within one
+                       * instruction; cleared again after each use */
+};
+
+/*
+ * Step the live set backwards across I: every SSA index I writes is removed,
+ * then every SSA index I reads is added. This is a cut down version of the
+ * function in jay_liveness.c.
+ */
+static void
+liveness_update(struct u_sparse_bitset *live, jay_inst *I)
+{
+   /* Definitions are killed first... */
+   jay_foreach_dst_index(I, d, index) {
+      u_sparse_bitset_clear(live, index);
+   }
+
+   /* ...and uses are made live second */
+   jay_foreach_src_index(I, s, c, index) {
+      u_sparse_bitset_set(live, index);
+   }
+}
+
+/*
+ * Append one DAG node for each instruction of block that lies between its
+ * block-starting instructions and its block-ending instruction, then seed the
+ * DAG heads for those nodes.
+ *
+ * def maps an SSA index to the node that defines it (0 if none). It is shared
+ * across calls, so entries below this block's first node are ignored. The func
+ * parameter is currently unused.
+ */
+static void
+populate_dag(struct sched_ctx *ctx,
+             jay_function *func,
+             jay_block *block,
+             uint32_t *def)
+{
+   struct jay_dag *dag = &ctx->dag;
+   const uint32_t base = dag->node;
+
+   /* TODO: Reorder memory instructions */
+   uint32_t last_sidefx = 0;
+   uint32_t last_a0 = 0;
+
+   jay_foreach_inst_in_block(block, I) {
+      if (jay_op_starts_block(I->op)) {
+         continue;
+      }
+
+      if (jay_op_ends_block(I->op)) {
+         break;
+      }
+
+      /* The node index stays fixed until jay_dag_next_node below */
+      const uint32_t node = dag->node;
+
+      /* Uses depend on definitions. SSA form forbids WaR and WaW hazards */
+      jay_foreach_src_index(I, s, c, index) {
+         uint32_t producer = def[index];
+
+         if (producer && producer >= base) {
+            jay_dag_add_edge(dag, producer);
+         }
+      }
+
+      jay_foreach_dst_index(I, d, index) {
+         def[index] = node;
+      }
+
+      /* Serialize address register access until we have an address RA */
+      bool touches_a0 =
+         (I->dst.file == J_ADDRESS) || (I->op == JAY_OPCODE_SHUFFLE);
+
+      jay_foreach_src(I, s) {
+         if (I->src[s].file == J_ADDRESS) {
+            touches_a0 = true;
+         }
+      }
+
+      if (touches_a0) {
+         jay_dag_add_edge(dag, last_a0);
+         last_a0 = node;
+      }
+
+      /* Serialize side effects for now */
+      bool impure_send = (I->op == JAY_OPCODE_SEND) && !jay_send_pure(I);
+
+      if (impure_send || I->op == JAY_OPCODE_SCHEDULE_BARRIER) {
+         jay_dag_add_edge(dag, last_sidefx);
+         last_sidefx = node;
+      }
+
+      ctx->insts[node] = I;
+      jay_dag_next_node(dag);
+   }
+
+   jay_dag_finalize(dag, base);
+}
+
+/*
+ * Register demand is really a vector with one entry per register file. Because
+ * the register file is partitioned dynamically, we model demand as a single
+ * scalar instead, and this function supplies the weight of one value of x:
+ * nothing for the address file, one unit for a uniform value, and
+ * dispatch_width units otherwise.
+ */
+static unsigned
+scale(struct sched_ctx *ctx, jay_def x)
+{
+   if (x.file == J_ADDRESS) {
+      return 0;
+   }
+
+   if (jay_is_uniform(x)) {
+      return 1;
+   }
+
+   return ctx->dispatch_width;
+}
+
+/*
+ * Calculate the change in register pressure from scheduling a given
+ * instruction. Based on jay_calculate_register_demands, but without the use of
+ * kill-bits since we are reordering instructions.
+ *
+ * This half accounts for the destinations becoming live: each destination
+ * contributes its value count rounded up to a power of two, weighted by
+ * scale().
+ */
+static signed
+calculate_pressure_delta_before(struct sched_ctx *ctx, jay_inst *I)
+{
+   signed total = 0;
+
+   jay_foreach_dst(I, dst) {
+      unsigned slots = util_next_power_of_two(jay_num_values(dst));
+
+      total += slots * scale(ctx, dst);
+   }
+
+   return total;
+}
+
+/*
+ * Second half of the pressure change for I: everything that stops occupying
+ * registers once I has executed, judged against the current live set. The
+ * result is zero or negative.
+ *
+ * NOTE(review): both destination loops weight by I->dst rather than by the
+ * destination being iterated, unlike calculate_pressure_delta_before. Confirm
+ * this is intended for instructions with more than one destination.
+ */
+static signed
+calculate_pressure_delta_after(struct sched_ctx *ctx, jay_inst *I)
+{
+   signed total = 0;
+   unsigned slot = 0;
+
+   /* Dead destinations are those written by the instruction but killed
+    * immediately after the instruction finishes.
+    */
+   jay_foreach_dst_index(I, _, index) {
+      if (!u_sparse_bitset_test(&ctx->live, index)) {
+         total -= scale(ctx, I->dst);
+      }
+   }
+
+   /* Release the padding added when rounding up to a power of two */
+   jay_foreach_dst(I, d) {
+      unsigned used = jay_num_values(d);
+      unsigned padding = util_next_power_of_two(used) - used;
+
+      total -= padding * scale(ctx, I->dst);
+   }
+
+   /* Late-kill sources. We precomputed the deduplication info and stashed it in
+    * the I->last_use bitfield for convenience.
+    */
+   jay_foreach_src_index(I, s, c, index) {
+      if (BITSET_TEST(I->last_use, slot) &&
+          !u_sparse_bitset_test(&ctx->live, index)) {
+         total -= scale(ctx, I->src[s]);
+      }
+
+      slot++;
+   }
+
+   return total;
+}
+
+/*
+ * Choose the next instruction, bottom-up. For now we use a simple greedy
+ * heuristic: choose the instruction that has the best effect on liveness.
+ *
+ * Returns the DAG node of the chosen head, or 0 if there are no heads. When
+ * several heads share the lowest cost, the one visited last wins.
+ */
+static uint32_t
+choose_inst(struct sched_ctx *s)
+{
+   uint32_t winner = 0;
+   int32_t winning_cost = INT32_MAX;
+
+   util_dynarray_foreach(&s->dag.heads, uint32_t, candidate) {
+      jay_inst *I = s->insts[*candidate];
+
+      signed after = calculate_pressure_delta_after(s, I);
+      signed before = calculate_pressure_delta_before(s, I);
+      int32_t cost = -(after + before);
+
+      /* As a tiebreaker (only), sink flag writes to reduce specifically flag
+       * pressure, because spilling flags costs extra instructions and GPR
+       * pressure. This is a mildly positive heuristic.
+       *
+       * Doubling the cost leaves the low bit free for the tiebreak, which
+       * penalizes instructions that do not write a flag.
+       */
+      cost = (2 * cost) + (jay_is_null(I->cond_flag) ? 1 : 0);
+
+      if (cost <= winning_cost) {
+         winner = *candidate;
+         winning_cost = cost;
+      }
+   }
+
+   return winner;
+}
+
+/* Reset the live set to the block's live-out set, ready for a bottom-up walk */
+static void
+reset_live(struct sched_ctx *s, jay_block *block, void *memctx)
+{
+   u_sparse_bitset_free(&s->live);
+   u_sparse_bitset_dup_with_ctx(&s->live, &block->live_out, memctx);
+}
+
+/*
+ * Rebuild I->last_use so that, counting source components in iteration order,
+ * a bit is set only for the first occurrence of each SSA index. This filters
+ * duplicate sources out of the pressure calculation.
+ */
+static void
+flag_first_uses(struct sched_ctx *s, jay_inst *I)
+{
+   unsigned slot = 0;
+
+   BITSET_ZERO(I->last_use);
+
+   jay_foreach_src_index(I, _, c, index) {
+      if (!BITSET_TEST(s->seen, index)) {
+         BITSET_SET(I->last_use, slot);
+      }
+
+      BITSET_SET(s->seen, index);
+      slot++;
+   }
+
+   /* Leave the scratch set clear for the next instruction */
+   jay_foreach_src_index(I, _, c, index) {
+      BITSET_CLEAR(s->seen, index);
+   }
+}
+
+/*
+ * Account for I in a bottom-up pressure walk. The peak is sampled between the
+ * two halves of the delta.
+ */
+static void
+step_pressure(struct sched_ctx *s, jay_inst *I, signed *pressure, signed *peak)
+{
+   *pressure -= calculate_pressure_delta_after(s, I);
+   *peak = MAX2(*pressure, *peak);
+   *pressure -= calculate_pressure_delta_before(s, I);
+}
+
+/*
+ * Schedule one block whose DAG has already been populated. The peak pressure
+ * of the existing instruction order is measured first, then a new bottom-up
+ * order is built by draining the DAG heads into schedule. The block is only
+ * rewritten if the new order has a strictly lower peak.
+ */
+static void
+pressure_schedule_block(jay_function *func,
+                        jay_block *block,
+                        struct util_dynarray *schedule,
+                        struct sched_ctx *s,
+                        void *memctx)
+{
+   /* Our pressure calculations are all off by a constant, but that's ok */
+   signed orig_pressure = 0;
+   signed orig_max_pressure = 0;
+
+   /* Pass 1: walk the existing order backwards */
+   reset_live(s, block, memctx);
+
+   jay_foreach_inst_in_block_rev(block, I) {
+      if (jay_op_starts_block(I->op)) {
+         break;
+      }
+
+      if (jay_op_ends_block(I->op)) {
+         continue;
+      }
+
+      flag_first_uses(s, I);
+      step_pressure(s, I, &orig_pressure, &orig_max_pressure);
+      liveness_update(&s->live, I);
+   }
+
+   /* Pass 2: build the candidate order, also backwards */
+   signed pressure = 0;
+   signed max_pressure = 0;
+
+   reset_live(s, block, memctx);
+
+   while (s->dag.heads.size) {
+      uint32_t node = choose_inst(s);
+      jay_inst *chosen = s->insts[node];
+
+      step_pressure(s, chosen, &pressure, &max_pressure);
+      jay_dag_prune_head(&s->dag, node);
+
+      util_dynarray_append(schedule, node);
+      liveness_update(&s->live, chosen);
+   }
+
+   /* Apply the schedule only if it reduces pressure */
+   if (max_pressure >= orig_max_pressure) {
+      return;
+   }
+
+   util_dynarray_foreach(schedule, uint32_t, node) {
+      jay_remove_instruction(s->insts[*node]);
+   }
+
+   /* The schedule was built bottom-up, so it is replayed in reverse */
+   jay_builder b = jay_init_builder(func, jay_before_block(block));
+   util_dynarray_foreach_reverse(schedule, uint32_t, node) {
+      jay_builder_insert(&b, s->insts[*node]);
+   }
+}
+
+/*
+ * Per-function driver. Liveness and register demand are recomputed, then each
+ * block whose estimated demand reaches the threshold below is rescheduled.
+ * All scratch memory hangs off a temporary ralloc context that is freed
+ * before returning.
+ */
+static void
+pass(jay_function *f)
+{
+   jay_compute_liveness(f);
+   jay_calculate_register_demands(f);
+
+   void *memctx = ralloc_context(NULL);
+   void *linctx = linear_context(memctx);
+   struct util_dynarray schedule = UTIL_DYNARRAY_INIT;
+
+   /* DAG node 0 is reserved, so the count starts at one */
+   uint32_t nr_inst = 1;
+   jay_foreach_inst_in_func(f, _, I) {
+      nr_inst++;
+   }
+
+   struct sched_ctx sctx = {
+      .seen = BITSET_LINEAR_ZALLOC(linctx, f->ssa_alloc),
+      .dispatch_width = f->shader->dispatch_width,
+   };
+   uint32_t *def = linear_zalloc_array(linctx, uint32_t, f->ssa_alloc);
+   sctx.insts = linear_alloc_array(linctx, jay_inst *, nr_inst);
+   jay_dag_init(&sctx.dag, memctx, nr_inst);
+
+   const unsigned ugpr_per_grf = jay_ugpr_per_grf(f->shader);
+   const unsigned ugpr_per_gpr = jay_grf_per_gpr(f->shader) * ugpr_per_grf;
+
+   /* Schedule for pressure only blocks that might spill, to minimize harm
+    * done to ILP and such. We conservatively use 104 GRFs as the threshold
+    * instead of 128 to leave wiggle room for flag RA and late lowerings.
+    */
+   const unsigned threshold = 104 * ugpr_per_grf;
+
+   jay_foreach_block(f, block) {
+      /* Treat flags as GPR demand conservatively since they spill to GPRs */
+      unsigned demand_gpr = block->demand_max[GPR] +
+                            block->demand_max[FLAG] +
+                            block->demand_max[UFLAG];
+      unsigned demand = (demand_gpr * ugpr_per_gpr) + block->demand_max[UGPR];
+
+      if (demand >= threshold) {
+         util_dynarray_clear(&schedule);
+
+         populate_dag(&sctx, f, block, def);
+         pressure_schedule_block(f, block, &schedule, &sctx, memctx);
+      }
+   }
+
+   util_dynarray_fini(&schedule);
+   ralloc_free(memctx);
+}
+
+JAY_DEFINE_FUNCTION_PASS(jay_schedule_pressure, pass)
--- a/src/intel/compiler/jay/meson.build
+++ b/src/intel/compiler/jay/meson.build
@ -49,6 +49,7 @@ libintel_compiler_jay_files = files(
  'jay.h',
  'jay_assign_accumulators.c',
  'jay_assign_flags.c',
+  'jay_dag.c',
  'jay_from_nir.c',
  'jay_ir.h',
  'jay_insert_fp_mode.c',
@ -67,6 +68,7 @@ libintel_compiler_jay_files = files(
  'jay_repair_ssa.c',
  'jay_register_allocate.c',
  'jay_simd_width.c',
+  'jay_schedule.c',
  'jay_spill.c',
  'jay_to_binary.c',
  'jay_validate.c',