nak: Add cross-block instruction delay scheduling

Currently each block schedules instructions independently from other
blocks. Instructions in a block must then be scheduled conservatively
to avoid possible hazards originating in previous blocks.

Replace the algorithm with an optimistic data-flow pass that takes all
following blocks into account. This gains a minor performance
improvement across every shader (1-2%) and should never cause a runtime
performance regression.
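
Roughly, the pass is a backward fixed-point iteration whose join keeps,
for each register, the smallest delay accumulated on any successor
path. The real implementation uses compiler::dataflow::BackwardDataflow
and NAK's register trackers; the sketch below is for illustration only,
with made-up names and a plain HashMap as the per-block state:

    use std::collections::HashMap;

    type Reg = u32;
    // Cycles already elapsed after the end of the block before the
    // register's next use on some following path.
    type State = HashMap<Reg, u8>;

    // The join keeps the smallest elapsed delay: the soonest future
    // use is the one that can still cause a hazard.
    fn join(into: &mut State, from: &State) {
        for (&r, &d) in from {
            into.entry(r).and_modify(|e| *e = (*e).min(d)).or_insert(d);
        }
    }

    // Optimistic backward solve: start from empty states and iterate
    // the per-block transfer function until nothing changes.
    fn solve(
        succs: &[Vec<usize>],
        transfer: impl Fn(usize, &State) -> State,
    ) -> Vec<State> {
        let mut block_in = vec![State::new(); succs.len()];
        loop {
            let mut changed = false;
            for b in (0..succs.len()).rev() {
                let mut out = State::new();
                for &s in &succs[b] {
                    join(&mut out, &block_in[s]);
                }
                let new_in = transfer(b, &out);
                if new_in != block_in[b] {
                    block_in[b] = new_in;
                    changed = true;
                }
            }
            if !changed {
                return block_in;
            }
        }
    }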

Benchmarks:
furmark       34932 -> 35597 (+2%)
pixmark-piano 9027  ->  9113 (+1%)

Signed-off-by: Lorenzo Rossi <git@rossilorenzo.dev>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37108>
Lorenzo Rossi 2025-10-30 13:42:32 +01:00 committed by Marge Bot
parent e6d4eaed2a
commit 7c39f69871
3 changed files with 377 additions and 124 deletions


@@ -4,13 +4,15 @@
use crate::api::{GetDebugFlags, DEBUG};
use crate::ir::*;
use crate::opt_instr_sched_common::estimate_block_weight;
use crate::reg_tracker::{RegRefIterable, RegTracker};
use crate::reg_tracker::{RegRefIterable, RegTracker, SparseRegTracker};
use compiler::dataflow::ForwardDataflow;
use compiler::dataflow::{BackwardDataflow, ForwardDataflow};
use rustc_hash::{FxHashMap, FxHashSet};
use std::cmp::max;
use std::hash::Hash;
use std::ops::Range;
use std::{slice, u8};
use std::slice;
use std::{u16, u32, u8};
#[derive(Clone)]
enum RegUse<T: Clone> {
@@ -61,6 +63,79 @@ impl<T: Clone> RegUse<T> {
}
}
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
enum RegReadWrite {
Read,
Write,
}
/// Maps each register read/write to a value.
///
/// A register can have multiple reads AND multiple writes at the same
/// point in time if it comes from a merge. Within a single CFG block,
/// a RegUseMap will always be either empty, hold a single write, or
/// hold one or more reads.
///
/// We need to track multiple reads as we don't know which one can cause
/// the highest latency for the interfering instruction (in RaW). For the
/// same reason we might need to track both reads and writes in the case
/// of a CFG block with multiple successors. (See the illustrative usage
/// sketch after the impl block below.)
#[derive(Clone, PartialEq, Eq, Default)]
struct RegUseMap<K: Hash + Eq, V> {
map: FxHashMap<(RegReadWrite, K), V>,
}
impl<K, V> RegUseMap<K, V>
where
K: Copy + Default + Hash + Eq,
V: Clone,
{
pub fn add_read(&mut self, k: K, v: V) {
// Reads wait on previous writes (RaR hazards don't exist)
self.map.retain(|k, _v| k.0 != RegReadWrite::Write);
self.map.insert((RegReadWrite::Read, k), v);
}
pub fn set_write(&mut self, k: K, v: V) {
// Writes wait on all previous reads and writes
self.map.clear();
self.map.insert((RegReadWrite::Write, k), v);
}
pub fn iter_reads(&self) -> impl Iterator<Item = (&K, &V)> {
self.map
.iter()
.filter(|(k, _v)| k.0 == RegReadWrite::Read)
.map(|(k, v)| (&k.1, v))
}
pub fn iter_writes(&self) -> impl Iterator<Item = (&K, &V)> {
self.map
.iter()
.filter(|(k, _v)| k.0 == RegReadWrite::Write)
.map(|(k, v)| (&k.1, v))
}
/// Merge two instances using a custom merger for value conflicts
pub fn merge_with(
&mut self,
other: &Self,
mut merger: impl FnMut(&V, &V) -> V,
) {
use std::collections::hash_map::Entry;
for (k, v) in other.map.iter() {
match self.map.entry(*k) {
Entry::Vacant(vacant_entry) => {
vacant_entry.insert(v.clone());
}
Entry::Occupied(mut occupied_entry) => {
let orig = occupied_entry.get_mut();
*orig = merger(&orig, v);
}
}
}
}
}
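// Illustrative usage sketch (editor's example, not part of this commit):
// set_write clears the map, add_read drops any pending write, and
// merge_with keeps the smaller accumulated delay for conflicting keys.
fn _reg_use_map_example() {
    let mut a: RegUseMap<u32, u8> = RegUseMap::default();
    a.set_write(0, 4); // a write clears every other entry
    a.add_read(1, 2); // a read drops the pending write (no RaR hazards)
    assert!(a.iter_writes().next().is_none());

    let mut b: RegUseMap<u32, u8> = RegUseMap::default();
    b.add_read(1, 7);
    // Keep the lower (more hazardous) accumulated delay on conflicts.
    a.merge_with(&b, |x, y| (*x).min(*y));
    assert_eq!(a.iter_reads().map(|(_k, v)| *v).max(), Some(2));
}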
struct DepNode {
read_dep: Option<usize>,
first_wait: Option<(usize, usize)>,
@@ -670,127 +745,288 @@ fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
}
}
fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u64 {
let mut min_num_static_cycles = 0;
for i in (0..f.blocks.len()).rev() {
let b = &mut f.blocks[i];
let mut cycle = 0_u32;
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct RegOrigin {
loc: InstrIdx,
// Index of the src (for reads) or dst (for writes) in the instruction.
src_dst_idx: u16,
}
// Vector mapping IP to start cycle
let mut instr_cycle = vec![0; b.instrs.len()];
impl Default for RegOrigin {
fn default() -> Self {
// Lower bound
Self {
loc: InstrIdx::new(0, 0),
src_dst_idx: 0,
}
}
}
// Maps registers to RegUse<ip, src_dst_idx>. Predicates are
// represented by src_idx = usize::MAX.
let mut uses: Box<RegTracker<RegUse<(usize, usize)>>> =
Box::new(RegTracker::new_with(&|| RegUse::None));
// Delay accumulated from the blocks already passed through, used to
// check for cross-block hazards.
type AccumulatedDelay = u8;
type DelayRegTracker = SparseRegTracker<RegUseMap<RegOrigin, AccumulatedDelay>>;
// Map from barrier to last waited cycle
let mut bars = [0_u32; 6];
struct BlockDelayScheduler<'a> {
sm: &'a dyn ShaderModel,
f: &'a Function,
// Map from barrier to last waited cycle
bars: [u32; 6],
// Current cycle count until end-of-block.
current_cycle: u32,
// Map from idx (block, instr) to block-relative cycle
instr_cycles: &'a mut Vec<Vec<u32>>,
}
for ip in (0..b.instrs.len()).rev() {
let instr = &b.instrs[ip];
let mut min_start = cycle + sm.exec_latency(&instr.op);
if let Some(bar) = instr.deps.rd_bar() {
min_start = max(min_start, bars[usize::from(bar)] + 2);
}
if let Some(bar) = instr.deps.wr_bar() {
min_start = max(min_start, bars[usize::from(bar)] + 2);
}
uses.for_each_instr_dst_mut(instr, |i, u| match u {
RegUse::None => {
// We don't know how it will be used, but it may be used in
// the next block, so we need to at least assume the maximum
// destination latency from the end of the block.
let s = sm.worst_latency(&instr.op, i);
min_start = max(min_start, s);
}
RegUse::Write((w_ip, w_dst_idx)) => {
let s = instr_cycle[*w_ip]
+ sm.waw_latency(
&instr.op,
i,
!instr.pred.pred_ref.is_none(),
&b.instrs[*w_ip].op,
*w_dst_idx,
);
min_start = max(min_start, s);
}
RegUse::Reads(reads) => {
for (r_ip, r_src_idx) in reads {
let c = instr_cycle[*r_ip];
let s = if *r_src_idx == usize::MAX {
c + sm.paw_latency(&instr.op, i)
} else {
c + sm.raw_latency(
&instr.op,
i,
&b.instrs[*r_ip].op,
*r_src_idx,
)
};
min_start = max(min_start, s);
}
}
});
uses.for_each_instr_src_mut(instr, |i, u| match u {
RegUse::None => (),
RegUse::Write((w_ip, w_dst_idx)) => {
let s = instr_cycle[*w_ip]
+ sm.war_latency(
&instr.op,
i,
&b.instrs[*w_ip].op,
*w_dst_idx,
);
min_start = max(min_start, s);
}
RegUse::Reads(_) => (),
});
impl BlockDelayScheduler<'_> {
/// Compute the starting cycle for an instruction to avoid a dependency hazard.
fn dependency_to_cycle(
&self,
curr_loc: InstrIdx, // Location of the current instruction
reg: &RegOrigin, // Register and location of instruction that will be executed later
delay: AccumulatedDelay, // Delay between the end of the current block and the later instruction
latency: u32, // Latency between current and later instruction
) -> u32 {
debug_assert!(latency <= self.sm.latency_upper_bound());
let instr = &mut b.instrs[ip];
let same_block = reg.loc.block_idx == curr_loc.block_idx
&& reg.loc.instr_idx > curr_loc.instr_idx;
let delay = min_start - cycle;
let delay = delay.max(MIN_INSTR_DELAY.into()).try_into().unwrap();
instr.deps.set_delay(delay);
if same_block {
// Created during this transfer pass, so its cycle is already known.
self.instr_cycles[reg.loc.block_idx as usize]
[reg.loc.instr_idx as usize]
+ latency
} else {
// Remember that cycles are always counted from the end of a block.
// The later instruction executes `delay` cycles after the current
// block is complete, so it is effectively executed at cycle
// `0 - delay`; adding the latency we get `latency - delay`.
// Underflow means the dependency is already satisfied
// (delay > latency).
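// Example with illustrative numbers: latency = 6 and an accumulated
// delay of 4 past the end of the block mean the current instruction
// must start at least 6 - 4 = 2 cycles before the block ends; with
// delay >= 6 the subtraction saturates to 0 and the dependency adds
// no constraint.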
latency.checked_sub(delay.into()).unwrap_or(0)
}
}
instr_cycle[ip] = min_start;
fn process_instr(&mut self, loc: InstrIdx, reg_uses: &mut DelayRegTracker) {
let instr = &self.f[loc];
// Set the writes before adding the reads
// as we are iterating backwards through instructions.
uses.for_each_instr_dst_mut(instr, |i, c| {
c.set_write((ip, i));
});
uses.for_each_instr_pred_mut(instr, |c| {
c.add_read((ip, usize::MAX));
});
uses.for_each_instr_src_mut(instr, |i, c| {
c.add_read((ip, i));
});
// Kepler A membar conflicts with predicate writes
if sm.is_kepler_a() && matches!(&instr.op, Op::MemBar(_)) {
uses.for_each_pred(|c| {
c.add_read((ip, usize::MAX));
});
uses.for_each_carry(|c| {
c.add_read((ip, usize::MAX));
});
}
for (bar, c) in bars.iter_mut().enumerate() {
if instr.deps.wt_bar_mask & (1 << bar) != 0 {
*c = min_start;
}
}
let mut min_start =
self.current_cycle + self.sm.exec_latency(&instr.op);
cycle = min_start;
// Wait on rd/wr barriers
if let Some(bar) = instr.deps.rd_bar() {
min_start = max(min_start, self.bars[usize::from(bar)] + 2);
}
if let Some(bar) = instr.deps.wr_bar() {
min_start = max(min_start, self.bars[usize::from(bar)] + 2);
}
let block_weight = estimate_block_weight(&f.blocks, i);
min_num_static_cycles = u64::from(cycle)
.checked_mul(block_weight)
.expect("Cycle count estimate overflow")
.checked_add(min_num_static_cycles)
.expect("Cycle count estimate overflow");
reg_uses.for_each_instr_dst_mut(instr, |i, u| {
for (orig, delay) in u.iter_writes() {
let l = self.sm.waw_latency(
&instr.op,
i,
!instr.pred.pred_ref.is_none(),
&self.f[orig.loc].op,
orig.src_dst_idx as usize,
);
let s = self.dependency_to_cycle(loc, orig, *delay, l);
min_start = max(min_start, s);
}
for (orig, delay) in u.iter_reads() {
let l = if orig.src_dst_idx == u16::MAX {
self.sm.paw_latency(&instr.op, i)
} else {
self.sm.raw_latency(
&instr.op,
i,
&self.f[orig.loc].op,
orig.src_dst_idx as usize,
)
};
let s = self.dependency_to_cycle(loc, orig, *delay, l);
min_start = max(min_start, s);
}
u.set_write(
RegOrigin {
loc,
src_dst_idx: i as u16,
},
0,
);
});
reg_uses.for_each_instr_pred_mut(instr, |c| {
// WaP does not exist
c.add_read(
RegOrigin {
loc,
src_dst_idx: u16::MAX,
},
0,
);
});
reg_uses.for_each_instr_src_mut(instr, |i, u| {
for (orig, delay) in u.iter_writes() {
let l = self.sm.war_latency(
&instr.op,
i,
&self.f[orig.loc].op,
orig.src_dst_idx as usize,
);
let s = self.dependency_to_cycle(loc, orig, *delay, l);
min_start = max(min_start, s);
}
u.add_read(
RegOrigin {
loc,
src_dst_idx: i as u16,
},
0,
);
});
self.instr_cycles[loc.block_idx as usize][loc.instr_idx as usize] =
min_start;
// Kepler A membar conflicts with predicate writes
if self.sm.is_kepler_a() && matches!(&instr.op, Op::MemBar(_)) {
let read_origin = RegOrigin {
loc,
src_dst_idx: u16::MAX,
};
reg_uses.for_each_pred(|c| {
c.add_read(read_origin.clone(), 0);
});
reg_uses.for_each_carry(|c| {
c.add_read(read_origin.clone(), 0);
});
}
// "Issue" barriers other instructions will wait on.
for (bar, c) in self.bars.iter_mut().enumerate() {
if instr.deps.wt_bar_mask & (1 << bar) != 0 {
*c = min_start;
}
}
self.current_cycle = min_start;
}
}
fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u64 {
let mut instr_cycles: Vec<Vec<u32>> =
f.blocks.iter().map(|b| vec![0; b.instrs.len()]).collect();
let mut state_in: Vec<_> = vec![DelayRegTracker::default(); f.blocks.len()];
let mut state_out: Vec<_> =
vec![DelayRegTracker::default(); f.blocks.len()];
let latency_upper_bound: u8 = sm
.latency_upper_bound()
.try_into()
.expect("Latency upper bound too large!");
// Compute instruction delays using an optimistic backwards data-flow
// algorithm. For back-edges we assume the best and recompute when
// new data is available. This yields correct results as long as
// the data-flow analysis is run until completion.
BackwardDataflow {
cfg: &f.blocks,
block_in: &mut state_in[..],
block_out: &mut state_out[..],
transfer: |block_idx, block, reg_in, reg_out| {
let mut uses = reg_out.clone();
let mut sched = BlockDelayScheduler {
sm,
f,
// Barriers are handled by `assign_barriers`, which does
// not handle cross-block barrier signal/wait, so we can
// safely assume that no barrier is active at the start
// or end of the block.
bars: [0_u32; 6],
current_cycle: 0_u32,
instr_cycles: &mut instr_cycles,
};
for ip in (0..block.instrs.len()).rev() {
let loc = InstrIdx::new(block_idx, ip);
sched.process_instr(loc, &mut uses);
}
// Update accumulated delay
let block_cycles = sched.current_cycle;
uses.retain(|reg_use| {
reg_use.map.retain(|(_rw, k), v| {
let overcount = if k.loc.block_idx as usize == block_idx {
// Only instrs before instr_idx must be counted
instr_cycles[k.loc.block_idx as usize]
[k.loc.instr_idx as usize]
} else {
0
};
let instr_executed = (block_cycles - overcount)
.try_into()
.unwrap_or(u8::MAX);
// We only care about the accumulated delay until it
// is bigger than the maximum delay of an instruction;
// after that, it cannot cause hazards.
let (added, overflow) =
(*v).overflowing_add(instr_executed);
*v = added;
// Stop keeping track of entries that happened too
// many cycles "in the future", and cannot affect
// scheduling anymore
!overflow && added <= latency_upper_bound
});
!reg_use.map.is_empty()
});
if *reg_in == uses {
false
} else {
*reg_in = uses;
true
}
},
join: |curr_in, succ_out| {
// We start with an optimistic assumption and gradually make it
// less optimistic, so in the join operation we need to keep
// the "worst" accumulated latency, that is, the lowest one:
// if an instruction has an accumulated latency of 2 cycles
// it can interfere with the next block, while if it had 200
// cycles it is highly unlikely to interfere.
curr_in.merge_with(succ_out, |a, b| {
a.merge_with(b, |ai, bi| (*ai).min(*bi))
});
},
}
.solve();
// Update the deps.delay for each instruction and compute
// min_num_static_cycles.
for (bi, b) in f.blocks.iter_mut().enumerate() {
let cycles = &instr_cycles[bi];
for (ip, i) in b.instrs.iter_mut().enumerate() {
let delay = cycles[ip] - cycles.get(ip + 1).copied().unwrap_or(0);
let delay: u8 = delay.try_into().expect("Delay overflow");
i.deps.delay = delay.max(MIN_INSTR_DELAY) as u8;
}
}
let min_num_static_cycles = instr_cycles
.iter()
.enumerate()
.map(|(block_idx, cycles)| {
let cycles = cycles.last().copied().unwrap_or(0);
let block_weight = estimate_block_weight(&f.blocks, block_idx);
u64::from(cycles)
.checked_mul(block_weight)
.expect("Cycle count estimate overflow")
})
.reduce(|a, b| a.checked_add(b).expect("Cycle count estimate overflow"))
.unwrap_or(0);
let max_instr_delay = sm.max_instr_delay();
f.map_instrs(|mut instr, _| {


@@ -8756,6 +8756,24 @@ impl BasicBlock {
}
}
/// Stores the index of an instruction in a given Function
///
/// The block and instruction indices are stored in a memory-efficient way.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct InstrIdx {
pub block_idx: u32,
pub instr_idx: u32,
}
impl InstrIdx {
pub fn new(bi: usize, ii: usize) -> Self {
Self {
block_idx: bi.try_into().expect("Block index overflow"),
instr_idx: ii.try_into().expect("Instruction index overflow"),
}
}
}
pub struct Function {
pub ssa_alloc: SSAValueAllocator,
pub phi_alloc: PhiAllocator,
@@ -8847,6 +8865,17 @@ impl fmt::Display for Function {
}
}
impl Index<InstrIdx> for Function {
type Output = Instr;
fn index(&self, index: InstrIdx) -> &Self::Output {
// Removed at compile time (except for 16-bit targets)
let block_idx: usize = index.block_idx.try_into().unwrap();
let instr_idx: usize = index.instr_idx.try_into().unwrap();
&self.blocks[block_idx].instrs[instr_idx]
}
}
#[derive(Debug)]
pub struct ComputeShaderInfo {
pub local_size: [u16; 3],


@@ -34,18 +34,6 @@ impl<T> RegTracker<T> {
carry: new_array_with(f),
}
}
pub fn for_each_pred(&mut self, mut f: impl FnMut(&mut T)) {
for p in &mut self.pred[..] {
f(p);
}
}
pub fn for_each_carry(&mut self, mut f: impl FnMut(&mut T)) {
for c in &mut self.carry {
f(c);
}
}
}
impl<T> Index<RegRef> for RegTracker<T> {