jay: schedule for pressure

Implement a simple pre-RA bottom-up list scheduler with the goal of decreasing
register pressure. On Xe2, this significantly reduces spilling.

SSA form allows us to estimate register demand cheaply and accurately, which
theoretically [1] gives this algorithm the two Hippocratic properties:

1. Shaders with low register pressure are unaffected.
2. Register pressure can only be decreased, never increased.

In other words: first, do no harm.

The heuristic itself is very simple: greedily choose instructions that decrease
liveness using a backwards list scheduler. This is far from optimal! But thanks
to the above properties, even a heuristic that picked random instructions would
be a win overall - by construction, we can only ever win.

In other words: this scheduler is your older brother powering off the game
console any time he's about to lose a game, maintaining a 100% win rate.

[1] In reality, neither property is strictly satisfied due to the messy details
of mapping our clean logical model onto Intel's many weird physical register
files. Nevertheless, the algorithm is well-motivated and the empirical results
on Xe2 are excellent.

SIMD16:

   Totals:
   Instrs: 2754194 -> 2753957 (-0.01%); split: -0.23%, +0.22%
   CodeSize: 41094768 -> 41092768 (-0.00%); split: -0.23%, +0.23%
   Number of spill instructions: 1724 -> 1129 (-34.51%)
   Number of fill instructions: 1912 -> 1119 (-41.47%)

   Totals from 168 (6.35% of 2647) affected shaders:
   Instrs: 850994 -> 850757 (-0.03%); split: -0.75%, +0.73%
   CodeSize: 12825680 -> 12823680 (-0.02%); split: -0.74%, +0.73%
   Number of spill instructions: 1724 -> 1129 (-34.51%)
   Number of fill instructions: 1912 -> 1119 (-41.47%)

SIMD32:

   Totals:
   Instrs: 4688858 -> 4557800 (-2.80%); split: -3.53%, +0.74%
   CodeSize: 70177200 -> 68214816 (-2.80%); split: -3.53%, +0.74%
   Number of spill instructions: 50316 -> 45795 (-8.99%); split: -9.56%, +0.57%
   Number of fill instructions: 51526 -> 45075 (-12.52%); split: -13.23%, +0.71%

   Totals from 819 (30.94% of 2647) affected shaders:
   Instrs: 3810182 -> 3679124 (-3.44%); split: -4.35%, +0.91%
   CodeSize: 57044000 -> 55081616 (-3.44%); split: -4.35%, +0.91%
   Number of spill instructions: 49264 -> 44743 (-9.18%); split: -9.76%, +0.58%
   Number of fill instructions: 50182 -> 43731 (-12.86%); split: -13.58%, +0.73%

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41688>
This commit is contained in:
Alyssa Rosenzweig 2026-05-13 13:23:08 -04:00 committed by Marge Bot
parent 81e21a8756
commit bc22a37d98
8 changed files with 462 additions and 0 deletions

View file

@ -0,0 +1,85 @@
/*
* Copyright 2026 Intel Corporation
* Copyright 2019 Broadcom
* SPDX-License-Identifier: MIT
*/
#include "jay_dag.h"
#include <stdint.h>
#include "util/ralloc.h"
#include "util/u_dynarray.h"
/**
 * Initializes an empty DAG with room for node_count nodes.
 *
 * Node 0 is reserved, so the first node to be built is node 1. The two
 * per-node arrays and both dynamic arrays are allocated out of memctx.
 */
void
jay_dag_init(struct jay_dag *dag, void *memctx, uint32_t node_count)
{
   assert(node_count >= 1 && "node 0 is reserved and always present");

   uint32_t *adjacency = rzalloc_array(memctx, uint32_t, node_count);
   uint32_t *parent_counts = rzalloc_array(memctx, uint32_t, node_count);

   *dag = (struct jay_dag) {
      .node = 1,
      .node_count = node_count,
      .parent_counts = parent_counts,
      .adjacency = adjacency,
   };

   util_dynarray_init(&dag->heads, memctx);
   util_dynarray_init(&dag->edges, memctx);
}
/**
 * Records an edge from the node currently being built (dag->node) to child.
 *
 * Edges to node 0, edges from the node to itself and edges that were already
 * recorded for the current node are silently dropped. Otherwise the edge is
 * appended and the child's parent count is incremented.
 */
void
jay_dag_add_edge(struct jay_dag *dag, uint32_t child)
{
   /* Degenerate edges: the reserved node 0 and self-references */
   if (child == 0 || child == dag->node) {
      return;
   }

   assert(child < dag->node_count);

   /* The current node's edges start where the previous node's edges ended */
   uint32_t begin = 0;
   if (dag->node > 0) {
      begin = dag->adjacency[dag->node - 1];
   }

   /* Prune duplicates so a parent is counted at most once per child */
   for (uint32_t e = begin;
        e < util_dynarray_num_elements(&dag->edges, uint32_t); ++e) {
      if (*util_dynarray_element(&dag->edges, uint32_t, e) == child) {
         return;
      }
   }

   util_dynarray_append(&dag->edges, child);
   dag->parent_counts[child]++;
}
/**
 * Finishes the node currently being built and advances to the next index.
 *
 * The number of edges recorded so far is stored as the end offset of the
 * current node's edge range; the same value is the start offset of the
 * following node's range.
 */
void
jay_dag_next_node(struct jay_dag *dag)
{
   assert(dag->node < dag->node_count);

   const uint32_t edges_so_far =
      util_dynarray_num_elements(&dag->edges, uint32_t);

   dag->adjacency[dag->node] = edges_so_far;
   dag->node++;
}
/**
 * Seeds the heads list with every parentless node in [first_node, dag->node).
 *
 * A head is a node whose parent count is zero, i.e. no recorded edge points at
 * it. Nodes are visited from the most recently built one downwards, so heads
 * are appended in descending node order.
 *
 * Node 0 is reserved and is never reported as a head, even if first_node is 0.
 *
 * @param dag         DAG whose nodes have all been built.
 * @param first_node  Lowest node index to consider.
 */
void
jay_dag_finalize(struct jay_dag *dag, uint32_t first_node)
{
   /* Never look at the reserved node 0 */
   const uint32_t lowest = first_node ? first_node : 1;

   /* Count down with the test before the decrement. An "i >= first_node; --i"
    * loop never terminates for first_node == 0: the unsigned index wraps
    * around instead of going negative and parent_counts[] is then indexed far
    * out of bounds.
    */
   for (uint32_t i = dag->node; i-- > lowest;) {
      if (dag->parent_counts[i] == 0) {
         util_dynarray_append(&dag->heads, i);
      }
   }
}
/**
 * Retires a head of the DAG.
 *
 * The head is dropped from the heads list and each of its children loses one
 * parent. Any child left without parents is appended to the heads list.
 */
void
jay_dag_prune_head(struct jay_dag *dag, uint32_t head)
{
   assert(!dag->parent_counts[head]);
   util_dynarray_delete_unordered(&dag->heads, uint32_t, head);

   /* The head's edges span from the end of the previous node's range up to
    * the end of its own range.
    */
   uint32_t begin = 0;
   if (head > 0) {
      begin = dag->adjacency[head - 1];
   }

   for (unsigned e = begin; e < dag->adjacency[head]; ++e) {
      uint32_t *child = util_dynarray_element(&dag->edges, uint32_t, e);

      dag->parent_counts[*child]--;
      if (dag->parent_counts[*child] == 0) {
         util_dynarray_append(&dag->heads, *child);
      }
   }
}

View file

@ -0,0 +1,22 @@
/*
* Copyright 2026 Intel Corporation
* Copyright 2019 Broadcom
* SPDX-License-Identifier: MIT
*/
#pragma once
#include "util/u_dynarray.h"
/*
 * Dependency graph over nodes identified by small integers. Nodes are built
 * one at a time in increasing index order; node 0 is reserved.
 */
struct jay_dag {
/* heads: indices of nodes that currently have no parents.
 * edges: child indices of all nodes, stored back to back in node order.
 */
struct util_dynarray heads, edges;
/* Per node: number of recorded edges that point at the node and whose parent
 * has not been pruned yet.
 */
uint32_t *parent_counts;
/* Per node: end offset of the node's range within edges. The range starts at
 * the previous node's end offset.
 */
uint32_t *adjacency;
/* node: index of the node currently being built.
 * node_count: number of entries in the per-node arrays.
 */
uint32_t node, node_count;
};
/* Sets up an empty DAG able to hold node_count nodes, allocating from memctx.
 * Node 0 is reserved, so node_count must be at least 1.
 */
void jay_dag_init(struct jay_dag *dag, void *memctx, uint32_t node_count);
/* Removes head (which must have no parents) from the heads list and promotes
 * any of its children that are left without parents.
 */
void jay_dag_prune_head(struct jay_dag *dag, uint32_t head);
/* Adds an edge from the node being built to child. Edges to node 0, self
 * edges and duplicates are ignored.
 */
void jay_dag_add_edge(struct jay_dag *dag, uint32_t child);
/* Appends every parentless node from first_node up to the last built node to
 * the heads list.
 */
void jay_dag_finalize(struct jay_dag *dag, uint32_t first_node);
/* Closes the node being built and moves on to the next node index. */
void jay_dag_next_node(struct jay_dag *dag);

View file

@ -41,6 +41,7 @@ static const struct debug_named_value jay_debug_options[] = {
{ "spill", JAY_DBG_SPILL, "Shrink register file to test spilling" },
{ "sync", JAY_DBG_SYNC, "Sync after every instruction" },
{ "noacc", JAY_DBG_NOACC, "Disable accumulator substitution" },
{ "nosched", JAY_DBG_NOSCHED, "Disable scheduling" },
DEBUG_NAMED_VALUE_END
};
@ -2705,6 +2706,10 @@ jay_compile(const struct intel_device_info *devinfo,
jay_print(stdout, s);
}
if (!(jay_debug & JAY_DBG_NOSCHED)) {
JAY_PASS(s, jay_schedule_pressure);
}
JAY_PASS(s, jay_assign_flags);
if (!(jay_debug & JAY_DBG_NOOPT)) {
JAY_PASS(s, jay_opt_dead_code);

View file

@ -1098,6 +1098,10 @@ typedef struct jay_block {
/** Pretty printing based on original structured control flow */
uint8_t indent;
/* Register demand metadata calculated for scheduling use */
unsigned demand_max[JAY_NUM_SSA_FILES];
unsigned demand_out[JAY_NUM_SSA_FILES];
} jay_block;
static inline jay_block *

View file

@ -211,6 +211,11 @@ jay_calculate_register_demands(jay_function *func)
jay_print_inst(stdout, I);
}
}
jay_foreach_ssa_file(f) {
block->demand_max[f] = max_demand[f];
block->demand_out[f] = demands[f];
}
}
free(files);

View file

@ -17,6 +17,7 @@ extern "C" {
#define JAY_DBG_SPILL BITFIELD_BIT(2)
#define JAY_DBG_SYNC BITFIELD_BIT(3)
#define JAY_DBG_NOACC BITFIELD_BIT(4)
#define JAY_DBG_NOSCHED BITFIELD_BIT(5)
extern int jay_debug;
bool jay_nir_lower_bool(nir_shader *nir);
@ -70,6 +71,8 @@ void jay_opt_propagate_backwards(jay_shader *s);
void jay_opt_dead_code(jay_shader *s);
void jay_opt_predicate(jay_shader *s);
void jay_schedule_pressure(jay_shader *s);
void jay_lower_pre_ra(jay_shader *s);
void jay_lower_post_ra(jay_shader *s);
void jay_lower_spill(jay_function *func);

View file

@ -0,0 +1,336 @@
/*
* Copyright 2026 Intel Corporation
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2022 Collabora Ltd.
* Copyright 2019 Broadcom
* SPDX-License-Identifier: MIT
*/
/*
* This file implements a simple pre-RA bottom-up list scheduler with the goal
* of decreasing register pressure. On Xe2, this significantly reduces spilling.
*
* SSA form allows us to estimate register demand cheaply and accurately, which
* theoretically [1] gives this algorithm the two Hippocratic properties:
*
* 1. Shaders with low register pressure are unaffected.
* 2. Register pressure can only be decreased, never increased.
*
* In other words: first, do no harm.
*
* The heuristic itself is very simple: greedily choose instructions that
* decrease liveness using a backwards list scheduler. This is far from optimal!
* But thanks to the above properties, even a heuristic that picked random
* instructions would be a win overall - by construction, we can only ever win.
*
* [1] In reality, neither property is strictly satisfied due to the messy
* details of mapping our clean logical model onto Intel's many weird physical
* register files. Nevertheless, the algorithm is well-motivated and the
* empirical results on Xe2 are excellent.
*/
#include "util/bitset.h"
#include "util/ralloc.h"
#include "util/sparse_bitset.h"
#include "util/u_dynarray.h"
#include "jay_builder.h"
#include "jay_dag.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
#include "jay_private.h"
/* State shared by the scheduling helpers for one function */
struct sched_ctx {
/* Dependency graph; one node per schedulable instruction */
struct jay_dag dag;
/* Weight applied to values that are neither address nor uniform */
unsigned dispatch_width;
/* Maps a DAG node index to its instruction */
jay_inst **insts;
/* Live set tracked while walking a block bottom-up */
struct u_sparse_bitset live;
/* Scratch bitset over SSA indices used to deduplicate the sources of a single
 * instruction. It is cleared again after each instruction.
 */
BITSET_WORD *seen;
};
/*
 * Steps the live set backwards across I: everything I writes stops being live
 * and everything I reads becomes live. This is a reduced version of the
 * function in jay_liveness.c.
 */
static void
liveness_update(struct u_sparse_bitset *live, jay_inst *I)
{
   jay_foreach_dst_index(I, _, written) {
      u_sparse_bitset_clear(live, written);
   }

   jay_foreach_src_index(I, _, c, read) {
      u_sparse_bitset_set(live, read);
   }
}
/*
 * Adds one DAG node per schedulable instruction of the block, together with
 * the dependency edges between them, then seeds the DAG's heads list.
 *
 * def maps an SSA index to the DAG node that defines it, or 0 if no node has
 * been recorded for it. It is updated as instructions are visited.
 */
static void
populate_dag(struct sched_ctx *ctx,
jay_function *func,
jay_block *block,
uint32_t *def)
{
/* Nodes below this index belong to previously processed blocks */
uint32_t first_node_in_this_block = ctx->dag.node;
/* TODO: Reorder memory instructions */
/* Last node seen with side effects / touching the address register. Zero
 * means "none yet"; jay_dag_add_edge ignores edges to node 0.
 */
uint32_t sidefx = 0, address = 0;
jay_foreach_inst_in_block(block, I) {
/* Instructions that start or end the block are not given DAG nodes */
if (jay_op_starts_block(I->op)) {
continue;
} else if (jay_op_ends_block(I->op)) {
break;
}
/* Uses depend on definitions. SSA form forbids WaR and WaW hazards */
jay_foreach_src_index(I, s, c, index) {
/* Only definitions recorded in this block create edges */
if (def[index] && def[index] >= first_node_in_this_block) {
jay_dag_add_edge(&ctx->dag, def[index]);
}
}
jay_foreach_dst_index(I, d, index) {
def[index] = ctx->dag.node;
}
/* Serialize address register access until we have an address RA */
bool use_a0 = I->dst.file == J_ADDRESS || I->op == JAY_OPCODE_SHUFFLE;
jay_foreach_src(I, s) {
use_a0 |= I->src[s].file == J_ADDRESS;
}
if (use_a0) {
jay_dag_add_edge(&ctx->dag, address);
address = ctx->dag.node;
}
/* Serialize side effects for now */
if ((I->op == JAY_OPCODE_SEND && !jay_send_pure(I)) ||
I->op == JAY_OPCODE_SCHEDULE_BARRIER) {
jay_dag_add_edge(&ctx->dag, sidefx);
sidefx = ctx->dag.node;
}
/* Remember which instruction the node stands for, then close the node */
ctx->insts[ctx->dag.node] = I;
jay_dag_next_node(&ctx->dag);
}
/* Nodes of this block that nothing depends on are the initial heads */
jay_dag_finalize(&ctx->dag, first_node_in_this_block);
}
/*
 * Register demand is really a vector with one component per register file.
 * Because the register file is partitioned dynamically, we model demand as a
 * single scalar instead; this function returns the weight of a value in that
 * sum: 0 for address values, 1 for uniform values and the dispatch width for
 * everything else.
 */
static unsigned
scale(struct sched_ctx *ctx, jay_def x)
{
   if (x.file == J_ADDRESS) {
      return 0;
   }

   if (jay_is_uniform(x)) {
      return 1;
   }

   return ctx->dispatch_width;
}
/*
 * Calculate the change in register pressure from scheduling a given
 * instruction. Based on jay_calculate_register_demands, but without the use of
 * kill-bits since we are reordering instructions.
 *
 * This half accounts for the destinations becoming live: each destination
 * contributes its value count rounded up to a power of two, weighted by
 * scale().
 */
static signed
calculate_pressure_delta_before(struct sched_ctx *ctx, jay_inst *I)
{
   signed growth = 0;

   jay_foreach_dst(I, dst) {
      unsigned slots = util_next_power_of_two(jay_num_values(dst));

      growth += slots * scale(ctx, dst);
   }

   return growth;
}
/*
 * Second half of the pressure delta: everything that stops being live once
 * the instruction has finished. The result is zero or negative.
 *
 * ctx->live must hold the set of values live after the instruction.
 */
static signed
calculate_pressure_delta_after(struct sched_ctx *ctx, jay_inst *I)
{
   signed change = 0;

   /* Destinations that are written but not live afterwards die immediately
    * after the instruction finishes.
    */
   jay_foreach_dst_index(I, _, index) {
      if (!u_sparse_bitset_test(&ctx->live, index)) {
         change -= scale(ctx, I->dst);
      }
   }

   /* Give back the power-of-two padding of each destination */
   jay_foreach_dst(I, d) {
      unsigned n = jay_num_values(d);
      unsigned padding = util_next_power_of_two(n) - n;

      change -= padding * scale(ctx, I->dst);
   }

   /* Late-kill sources. The deduplication info was precomputed and stashed in
    * the I->last_use bitfield, indexed by source component position.
    */
   unsigned slot = 0;

   jay_foreach_src_index(I, s, c, index) {
      if (BITSET_TEST(I->last_use, slot) &&
          !u_sparse_bitset_test(&ctx->live, index)) {
         change -= scale(ctx, I->src[s]);
      }

      slot++;
   }

   return change;
}
/*
 * Choose the next instruction, bottom-up. For now we use a simple greedy
 * heuristic: choose the instruction that has the best effect on liveness.
 *
 * Returns the DAG node of the chosen head, or 0 if there are no heads. Among
 * equally good heads, the one visited last wins.
 */
static uint32_t
choose_inst(struct sched_ctx *s)
{
   uint32_t winner = 0;
   int32_t lowest = INT32_MAX;

   util_dynarray_foreach(&s->dag.heads, uint32_t, candidate) {
      jay_inst *I = s->insts[*candidate];

      /* Pressure change seen when walking backwards across I */
      int32_t cost = -(calculate_pressure_delta_after(s, I) +
                       calculate_pressure_delta_before(s, I));

      /* As a tiebreaker (only), sink flag writes to reduce specifically flag
       * pressure, because spilling flags costs extra instructions and GPR
       * pressure. Doubling the cost keeps the real delta dominant; the low bit
       * then penalizes instructions that do not write a flag.
       */
      cost = (cost * 2) + (jay_is_null(I->cond_flag) ? 1 : 0);

      if (cost > lowest) {
         continue;
      }

      winner = *candidate;
      lowest = cost;
   }

   return winner;
}
/*
 * Schedules one block whose DAG has already been populated.
 *
 * The block is walked bottom-up twice: once in its current order to measure
 * the maximum pressure, and once in the order chosen by choose_inst. The
 * instructions are only moved if the new order has a strictly lower maximum.
 *
 * schedule receives the chosen DAG nodes in bottom-up order and is expected to
 * be empty on entry. memctx owns the copies of the live-out set.
 */
static void
pressure_schedule_block(jay_function *func,
jay_block *block,
struct util_dynarray *schedule,
struct sched_ctx *s,
void *memctx)
{
/* Our pressure calculations are all off by a constant, but that's ok */
signed pressure = 0;
signed orig_max_pressure = 0;
/* Start from the block's live-out set, since we walk backwards */
u_sparse_bitset_free(&s->live);
u_sparse_bitset_dup_with_ctx(&s->live, &block->live_out, memctx);
/* Pass 1: measure the maximum pressure of the existing order */
jay_foreach_inst_in_block_rev(block, I) {
if (jay_op_starts_block(I->op)) {
break;
} else if (jay_op_ends_block(I->op)) {
continue;
}
unsigned counter = 0;
/* Filter duplicates as we go */
/* Only the first occurrence of each SSA index among the sources gets its
 * last_use bit set, so a repeated source is counted once.
 */
BITSET_ZERO(I->last_use);
jay_foreach_src_index(I, _, c, index) {
if (!BITSET_TEST(s->seen, index)) {
BITSET_SET(I->last_use, counter);
}
BITSET_SET(s->seen, index);
counter++;
}
/* Leave the scratch bitset empty for the next instruction */
jay_foreach_src_index(I, _, c, index) {
BITSET_CLEAR(s->seen, index);
}
/* Walking backwards: undo the "after" delta, sample the peak, then undo
 * the "before" delta.
 */
pressure -= calculate_pressure_delta_after(s, I);
orig_max_pressure = MAX2(pressure, orig_max_pressure);
pressure -= calculate_pressure_delta_before(s, I);
liveness_update(&s->live, I);
}
/* Pass 2: reset liveness and build the new order greedily */
u_sparse_bitset_free(&s->live);
u_sparse_bitset_dup_with_ctx(&s->live, &block->live_out, memctx);
signed max_pressure = 0;
pressure = 0;
while (s->dag.heads.size) {
uint32_t node = choose_inst(s);
pressure -= calculate_pressure_delta_after(s, s->insts[node]);
max_pressure = MAX2(pressure, max_pressure);
pressure -= calculate_pressure_delta_before(s, s->insts[node]);
jay_dag_prune_head(&s->dag, node);
util_dynarray_append(schedule, node);
liveness_update(&s->live, s->insts[node]);
}
/* Apply the schedule only if it reduces pressure */
if (max_pressure < orig_max_pressure) {
util_dynarray_foreach(schedule, uint32_t, node) {
jay_remove_instruction(s->insts[*node]);
}
jay_builder b = jay_init_builder(func, jay_before_block(block));
/* The schedule was built bottom-up, so insert it in reverse */
util_dynarray_foreach_reverse(schedule, uint32_t, node) {
jay_builder_insert(&b, s->insts[*node]);
}
}
}
/*
 * Per-function entry point: computes liveness and register demand, then
 * schedules for pressure every block whose estimated demand reaches the spill
 * threshold. All scratch memory lives in a temporary context that is freed
 * before returning.
 */
static void
pass(jay_function *f)
{
jay_compute_liveness(f);
jay_calculate_register_demands(f);
void *memctx = ralloc_context(NULL);
void *linctx = linear_context(memctx);
struct util_dynarray schedule = UTIL_DYNARRAY_INIT;
/* Count instructions, starting at 1 because DAG node 0 is reserved */
uint32_t nr_inst = 1;
jay_foreach_inst_in_func(f, _, I) {
++nr_inst;
}
BITSET_WORD *seen = BITSET_LINEAR_ZALLOC(linctx, f->ssa_alloc);
struct sched_ctx sctx = { .seen = seen,
.dispatch_width = f->shader->dispatch_width };
/* SSA index -> defining DAG node; zero means no node recorded */
uint32_t *def = linear_zalloc_array(linctx, uint32_t, f->ssa_alloc);
sctx.insts = linear_alloc_array(linctx, jay_inst *, nr_inst);
jay_dag_init(&sctx.dag, memctx, nr_inst);
/* Conversion factors so that both demands can be compared in UGPR units */
unsigned ugpr_per_grf = jay_ugpr_per_grf(f->shader);
unsigned ugpr_per_gpr = jay_grf_per_gpr(f->shader) * ugpr_per_grf;
jay_foreach_block(f, block) {
/* Treat flags as GPR demand conservatively since they spill to GPRs */
unsigned demand_ugpr = block->demand_max[UGPR];
unsigned demand_gpr = block->demand_max[GPR] +
block->demand_max[FLAG] +
block->demand_max[UFLAG];
/* Schedule for pressure only blocks that might spill, to minimize harm
* done to ILP and such. We conservatively use 104 GRFs as the threshold
* instead of 128 to leave wiggle room for flag RA and late lowerings.
*/
if (((demand_gpr * ugpr_per_gpr) + demand_ugpr) >= (104 * ugpr_per_grf)) {
/* The schedule array is reused from block to block */
util_dynarray_clear(&schedule);
populate_dag(&sctx, f, block, def);
pressure_schedule_block(f, block, &schedule, &sctx, memctx);
}
}
util_dynarray_fini(&schedule);
ralloc_free(memctx);
}
JAY_DEFINE_FUNCTION_PASS(jay_schedule_pressure, pass)

View file

@ -49,6 +49,7 @@ libintel_compiler_jay_files = files(
'jay.h',
'jay_assign_accumulators.c',
'jay_assign_flags.c',
'jay_dag.c',
'jay_from_nir.c',
'jay_ir.h',
'jay_insert_fp_mode.c',
@ -67,6 +68,7 @@ libintel_compiler_jay_files = files(
'jay_repair_ssa.c',
'jay_register_allocate.c',
'jay_simd_width.c',
'jay_schedule.c',
'jay_spill.c',
'jay_to_binary.c',
'jay_validate.c',