nak: Add a simple postpass instruction scheduler

To get us started, this is designed to be pretty much the simplest thing
possible. It runs post-RA, so we don't need to worry about hurting
occupancy, and it uses the classic textbook algorithm for local
(single-block) scheduling with the usual latency-weighted-depth heuristic.

-14.22% static cycle count on shaderdb

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32311>
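
As a rough illustration of the approach the message describes, here is a
minimal, self-contained sketch of a forward textbook list scheduler using the
latency-weighted-depth priority. The names and the dependence representation
are made up for this example, and it omits the ready-cycle/issue-cycle
bookkeeping the real pass does (see opt_instr_sched_postpass.rs below).

// Illustrative sketch only, not part of this commit. `deps[i]` lists
// (consumer, latency) pairs for instruction i; consumers always have
// larger indices than their producers.
fn list_schedule(deps: &[Vec<(usize, u32)>]) -> Vec<usize> {
    let n = deps.len();
    // Latency-weighted depth: longest latency path from each
    // instruction to the end of the block.
    let mut depth = vec![0u32; n];
    for i in (0..n).rev() {
        for &(c, lat) in &deps[i] {
            depth[i] = depth[i].max(depth[c] + lat);
        }
    }
    // Count unscheduled predecessors of each instruction.
    let mut preds = vec![0u32; n];
    for edges in deps {
        for &(c, _) in edges {
            preds[c] += 1;
        }
    }
    let mut ready: Vec<usize> = (0..n).filter(|&i| preds[i] == 0).collect();
    let mut order = Vec::with_capacity(n);
    // Greedily pick, among ready instructions, the one with the
    // greatest latency-weighted depth.
    while let Some(pos) = (0..ready.len()).max_by_key(|&p| depth[ready[p]]) {
        let i = ready.swap_remove(pos);
        order.push(i);
        for &(c, _) in &deps[i] {
            preds[c] -= 1;
            if preds[c] == 0 {
                ready.push(c);
            }
        }
    }
    order
}

For example, on a chain 0 -> 1 (latency 6) -> 2 (latency 4) plus an edge
0 -> 2 (latency 2), the depths come out as [10, 4, 0], so instruction 0 is
scheduled first.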
Author: Mel Henning, 2024-11-18 12:26:51 -05:00 (committed by Marge Bot)
Parent: d06d76a0d4
Commit: 79d0f8263d
4 changed files with 665 additions and 0 deletions

@@ -439,6 +439,7 @@ fn nak_compile_shader_internal(
s.remove_annotations();
pass!(s, opt_instr_sched_postpass);
pass!(s, calc_instr_deps);
s.gather_info();

@@ -16,6 +16,8 @@ mod opt_bar_prop;
mod opt_copy_prop;
mod opt_crs;
mod opt_dce;
mod opt_instr_sched_common;
mod opt_instr_sched_postpass;
mod opt_jump_thread;
mod opt_lop;
mod opt_out;

@@ -0,0 +1,413 @@
// Copyright © 2024 Valve Corporation
// SPDX-License-Identifier: MIT
use crate::ir::*;
use std::cmp::max;
use std::cmp::Reverse;
pub mod graph {
#[derive(Clone)]
pub struct Edge<EdgeLabel> {
pub label: EdgeLabel,
pub head_idx: usize,
}
#[derive(Clone)]
pub struct Node<NodeLabel, EdgeLabel> {
pub label: NodeLabel,
pub outgoing_edges: Vec<Edge<EdgeLabel>>,
}
#[derive(Clone)]
pub struct Graph<NodeLabel, EdgeLabel> {
pub nodes: Vec<Node<NodeLabel, EdgeLabel>>,
}
impl<NodeLabel, EdgeLabel> Graph<NodeLabel, EdgeLabel> {
pub fn new(node_labels: impl Iterator<Item = NodeLabel>) -> Self {
let nodes = node_labels
.map(|label| Node {
label,
outgoing_edges: Vec::new(),
})
.collect();
Graph { nodes }
}
pub fn add_edge(
&mut self,
tail_idx: usize,
head_idx: usize,
label: EdgeLabel,
) {
assert!(head_idx < self.nodes.len());
self.nodes[tail_idx]
.outgoing_edges
.push(Edge { label, head_idx });
}
pub fn reverse(&mut self) {
let old_edges: Vec<_> = self
.nodes
.iter_mut()
.map(|node| std::mem::take(&mut node.outgoing_edges))
.collect();
for (tail_idx, edges) in old_edges.into_iter().enumerate() {
for e in edges.into_iter() {
self.add_edge(e.head_idx, tail_idx, e.label);
}
}
}
}
}
#[derive(Eq, PartialEq)]
pub enum SideEffect {
/// No side effect (ALU-like)
None,
/// Instruction reads or writes memory
///
/// This will be serialized with respect to other
/// SideEffect::Memory instructions
Memory,
/// This instruction is a full code motion barrier
///
/// No other instruction will be re-ordered with respect to this one
Barrier,
}
pub fn side_effect_type(op: &Op) -> SideEffect {
match op {
// Float ALU
Op::F2FP(_)
| Op::FAdd(_)
| Op::FFma(_)
| Op::FMnMx(_)
| Op::FMul(_)
| Op::FSet(_)
| Op::FSetP(_)
| Op::HAdd2(_)
| Op::HFma2(_)
| Op::HMul2(_)
| Op::HSet2(_)
| Op::HSetP2(_)
| Op::HMnMx2(_)
| Op::FSwzAdd(_) => SideEffect::None,
// Multi-function unit
Op::Rro(_) | Op::MuFu(_) => SideEffect::None,
// Double-precision float ALU
Op::DAdd(_)
| Op::DFma(_)
| Op::DMnMx(_)
| Op::DMul(_)
| Op::DSetP(_) => SideEffect::None,
// Integer ALU
Op::BRev(_)
| Op::Flo(_)
| Op::PopC(_)
| Op::IMad(_)
| Op::IMul(_)
| Op::BMsk(_)
| Op::IAbs(_)
| Op::IAdd2(_)
| Op::IAdd2X(_)
| Op::IAdd3(_)
| Op::IAdd3X(_)
| Op::IDp4(_)
| Op::IMad64(_)
| Op::IMnMx(_)
| Op::ISetP(_)
| Op::Lea(_)
| Op::LeaX(_)
| Op::Lop2(_)
| Op::Lop3(_)
| Op::Shf(_)
| Op::Shl(_)
| Op::Shr(_)
| Op::Bfe(_) => SideEffect::None,
// Conversions
Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => {
SideEffect::None
}
// Move ops
Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => SideEffect::None,
Op::Shfl(_) => SideEffect::None,
// Predicate ops
Op::PLop3(_) | Op::PSetP(_) => SideEffect::None,
// Uniform ops
Op::R2UR(_) => SideEffect::None,
// Texture ops
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => SideEffect::Memory,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => SideEffect::Memory,
// Memory ops
Op::Ipa(_) | Op::Ldc(_) => SideEffect::None,
Op::Ld(_)
| Op::St(_)
| Op::Atom(_)
| Op::AL2P(_)
| Op::ALd(_)
| Op::ASt(_)
| Op::CCtl(_)
| Op::LdTram(_)
| Op::MemBar(_) => SideEffect::Memory,
// Control-flow ops
Op::BClear(_)
| Op::Break(_)
| Op::BSSy(_)
| Op::BSync(_)
| Op::SSy(_)
| Op::Sync(_)
| Op::Brk(_)
| Op::PBk(_)
| Op::Cont(_)
| Op::PCnt(_)
| Op::Bra(_)
| Op::Exit(_)
| Op::WarpSync(_) => SideEffect::Barrier,
// We don't model the barrier register yet, so serialize these
Op::BMov(_) => SideEffect::Memory,
// Geometry ops
Op::Out(_) | Op::OutFinal(_) => SideEffect::Barrier,
// Miscellaneous ops
Op::Bar(_) | Op::CS2R(_) | Op::Isberd(_) | Op::Kill(_) | Op::S2R(_) => {
SideEffect::Barrier
}
Op::PixLd(_) | Op::Nop(_) | Op::Vote(_) => SideEffect::None,
// Virtual ops
Op::Annotate(_)
| Op::ParCopy(_)
| Op::Swap(_)
| Op::Copy(_)
| Op::Undef(_) => SideEffect::None,
Op::SrcBar(_)
| Op::Pin(_)
| Op::Unpin(_)
| Op::PhiSrcs(_)
| Op::PhiDsts(_)
| Op::RegOut(_) => SideEffect::Barrier,
}
}
/// Try to guess how many cycles a variable latency instruction will take
///
/// These values are based on the cycle estimates from "Dissecting the NVidia
/// Turing T4 GPU via Microbenchmarking" https://arxiv.org/pdf/1903.07486
/// Latencies for memory instructions were copied from the paper's L1 data
/// cache latencies.
/// For instructions not mentioned in the paper, I made up numbers.
/// This could probably be improved.
pub fn estimate_variable_latency(sm: u8, op: &Op) -> u32 {
match op {
// Multi-function unit
Op::Rro(_) | Op::MuFu(_) => 15,
// Double-precision float ALU
Op::DFma(_) | Op::DSetP(_) => 54,
Op::DAdd(_) | Op::DMnMx(_) | Op::DMul(_) => 48,
// Integer ALU
Op::BRev(_) | Op::Flo(_) | Op::PopC(_) => 15,
Op::IMad(_) | Op::IMul(_) => {
assert!(sm < 70);
86
}
// Conversions
Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => 15,
// Move ops
Op::Shfl(_) => 15,
// Uniform ops
Op::R2UR(_) => 15,
// Texture ops
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => 32,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => 32,
// Memory ops
Op::Ldc(_) => 4,
Op::Ld(_)
| Op::St(_)
| Op::Atom(_)
| Op::AL2P(_)
| Op::ALd(_)
| Op::ASt(_)
| Op::Ipa(_)
| Op::CCtl(_)
| Op::LdTram(_)
| Op::MemBar(_) => 32,
// Control-flow ops
Op::WarpSync(_) => 16,
// Barrier
Op::BMov(_) => 16,
// Geometry ops
Op::Out(_) | Op::OutFinal(_) => 2,
// Miscellaneous ops
Op::Bar(_)
| Op::CS2R(_)
| Op::Isberd(_)
| Op::Kill(_)
| Op::PixLd(_)
| Op::S2R(_) => 16,
_ => panic!("Unknown variable latency op {op}"),
}
}
#[derive(Default, Clone)]
pub struct NodeLabel {
pub cycles_to_end: u32,
pub num_uses: u32,
/// The first cycle that the instruction can begin executing
pub ready_cycle: u32,
pub exec_latency: u32,
}
pub struct EdgeLabel {
pub latency: u32,
}
pub type DepGraph = graph::Graph<NodeLabel, EdgeLabel>;
pub fn calc_statistics(g: &mut DepGraph) -> Vec<usize> {
let mut initial_ready_list = Vec::new();
for i in (0..g.nodes.len()).rev() {
let node = &g.nodes[i];
let mut max_delay = 0;
for edge in &node.outgoing_edges {
assert!(edge.head_idx > i);
max_delay = max(
max_delay,
g.nodes[edge.head_idx].label.cycles_to_end + edge.label.latency,
);
}
let node = &mut g.nodes[i];
node.label.cycles_to_end = max_delay;
node.label.num_uses = node.outgoing_edges.len().try_into().unwrap();
if node.label.num_uses == 0 {
initial_ready_list.push(i);
}
}
return initial_ready_list;
}
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct ReadyInstr {
cycles_to_end: u32,
// We use the original instruction order as a final tie-breaker, the idea
// being that the original schedule is often not too bad. Since we're
// iterating in reverse order, that means scheduling the largest instruction
// index first.
pub index: usize,
}
impl ReadyInstr {
pub fn new<E>(g: &graph::Graph<NodeLabel, E>, i: usize) -> Self {
let label = &g.nodes[i].label;
// NodeLabel::cycles_to_end is cycles from the beginning of the
// instruction to the top of the block, but we want from the end of the
// instruction to the top of the block
let cycles_to_end = label.cycles_to_end + label.exec_latency;
ReadyInstr {
cycles_to_end,
index: i,
}
}
}
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct FutureReadyInstr {
/// The first cycle that the instruction can end executing
pub ready_cycle: Reverse<u32>,
pub index: usize,
}
impl FutureReadyInstr {
pub fn new<E>(g: &graph::Graph<NodeLabel, E>, i: usize) -> Self {
let label = &g.nodes[i].label;
// NodeLabel::ready_cycle is the earliest beginning cycle for the
// instruction, but we need the earliest end cycle for the instruction.
let ready_cycle = label.ready_cycle.saturating_sub(label.exec_latency);
FutureReadyInstr {
ready_cycle: Reverse(ready_cycle),
index: i,
}
}
}
#[allow(dead_code)]
pub fn save_graphviz(
instrs: &[Box<Instr>],
g: &DepGraph,
) -> std::io::Result<()> {
// dot /tmp/instr_dep_graph.dot -Tsvg > /tmp/instr_dep_graph.svg
use std::fs::File;
use std::io::{BufWriter, Write};
let file = File::create("/tmp/instr_dep_graph.dot")?;
let mut w = BufWriter::new(file);
writeln!(w, "digraph {{")?;
for (i, instr) in instrs.iter().enumerate() {
let l = &g.nodes[i].label;
writeln!(
w,
" {i} [label=\"{}\\n{}, {}\"];",
instr, l.cycles_to_end, l.num_uses
)?;
}
for (i, node) in g.nodes.iter().enumerate() {
for j in &node.outgoing_edges {
writeln!(
w,
" {i} -> {} [label=\"{}\"];",
j.head_idx, j.label.latency
)?;
}
}
writeln!(w, "}}")?;
w.flush()?;
Ok(())
}

@@ -0,0 +1,249 @@
// Copyright © 2024 Valve Corporation
// SPDX-License-Identifier: MIT
use crate::ir::*;
use crate::opt_instr_sched_common::*;
use crate::sched_common::{
exec_latency, instr_latency, paw_latency, raw_latency, war_latency,
waw_latency, RegTracker,
};
use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BinaryHeap;
struct RegUse<T: Clone> {
reads: Vec<T>,
write: Option<T>,
}
impl<T: Clone> RegUse<T> {
pub fn new() -> Self {
RegUse {
reads: Vec::new(),
write: None,
}
}
pub fn add_read(&mut self, dep: T) {
self.reads.push(dep);
}
pub fn set_write(&mut self, dep: T) {
self.write = Some(dep);
self.reads.clear();
}
}
fn generate_dep_graph(
sm: &dyn ShaderModel,
instrs: &Vec<Box<Instr>>,
) -> DepGraph {
let mut g = DepGraph::new((0..instrs.len()).map(|_| Default::default()));
// Maps registers to RegUse<ip, src_dst_idx>. Predicates are
// represented by src_idx = usize::MAX.
let mut uses: Box<RegTracker<RegUse<(usize, usize)>>> =
Box::new(RegTracker::new_with(&|| RegUse::new()));
let mut last_memory_op = None;
let mut last_barrier_op = None;
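// Note: we walk the block in reverse, so `uses`, last_memory_op, and
// last_barrier_op always describe instructions *after* ip. Every edge added
// below therefore points from an instruction to a later instruction that
// must stay ordered after it.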
for ip in (0..instrs.len()).rev() {
let instr = &instrs[ip];
if let Some(bar_ip) = last_barrier_op {
g.add_edge(ip, bar_ip, EdgeLabel { latency: 0 });
}
match side_effect_type(&instr.op) {
SideEffect::None => (),
SideEffect::Barrier => {
let last_ip = last_barrier_op.unwrap_or(instrs.len());
for other_ip in (ip + 1)..last_ip {
g.add_edge(ip, other_ip, EdgeLabel { latency: 0 });
}
last_barrier_op = Some(ip);
}
SideEffect::Memory => {
if let Some(mem_ip) = last_memory_op {
g.add_edge(ip, mem_ip, EdgeLabel { latency: 0 });
}
last_memory_op = Some(ip);
}
}
uses.for_each_instr_dst_mut(instr, |i, u| {
if let Some((w_ip, w_dst_idx)) = u.write {
let latency = waw_latency(
sm.sm(),
&instr.op,
i,
&instrs[w_ip].op,
w_dst_idx,
);
g.add_edge(ip, w_ip, EdgeLabel { latency });
}
for &(r_ip, r_src_idx) in &u.reads {
let mut latency = if r_src_idx == usize::MAX {
paw_latency(sm.sm(), &instr.op, i)
} else {
raw_latency(
sm.sm(),
&instr.op,
i,
&instrs[r_ip].op,
r_src_idx,
)
};
if !instr.has_fixed_latency(sm.sm()) {
latency = max(
latency,
estimate_variable_latency(sm.sm(), &instr.op),
);
}
g.add_edge(ip, r_ip, EdgeLabel { latency });
}
});
uses.for_each_instr_src_mut(instr, |i, u| {
if let Some((w_ip, w_dst_idx)) = u.write {
let latency = war_latency(
sm.sm(),
&instr.op,
i,
&instrs[w_ip].op,
w_dst_idx,
);
g.add_edge(ip, w_ip, EdgeLabel { latency });
}
});
// We're iterating in reverse, so writes are logically first
uses.for_each_instr_dst_mut(instr, |i, c| {
c.set_write((ip, i));
});
uses.for_each_instr_pred_mut(instr, |c| {
c.add_read((ip, usize::MAX));
});
uses.for_each_instr_src_mut(instr, |i, c| {
c.add_read((ip, i));
});
// Initialize this node's distance to the end
let mut ready_cycle = (0..instr.dsts().len())
.map(|i| instr_latency(sm.sm(), &instr.op, i))
.max()
.unwrap_or(0);
if !instr.has_fixed_latency(sm.sm()) {
let var_latency = estimate_variable_latency(sm.sm(), &instr.op)
+ exec_latency(sm.sm(), &instrs[instrs.len() - 1].op);
ready_cycle = max(ready_cycle, var_latency);
}
let label = &mut g.nodes[ip].label;
label.exec_latency = exec_latency(sm.sm(), &instr.op);
label.ready_cycle = ready_cycle;
}
g
}
fn generate_order(g: &mut DepGraph, init_ready_list: Vec<usize>) -> Vec<usize> {
let mut ready_instrs: BinaryHeap<ReadyInstr> = BinaryHeap::new();
let mut future_ready_instrs: BinaryHeap<FutureReadyInstr> = init_ready_list
.into_iter()
.map(|i| FutureReadyInstr::new(g, i))
.collect();
let mut current_cycle = 0;
let mut instr_order = Vec::with_capacity(g.nodes.len());
loop {
// Move ready instructions to the ready list
loop {
let Some(fri) = future_ready_instrs.peek() else {
break;
};
if current_cycle < fri.ready_cycle.0 {
break;
}
ready_instrs.push(ReadyInstr::new(g, fri.index));
future_ready_instrs.pop();
}
// Pick a ready instruction
let next_idx = match ready_instrs.pop() {
None => match future_ready_instrs.peek() {
None => break, // Both lists are empty. We're done!
Some(&FutureReadyInstr {
ready_cycle: Reverse(ready_cycle),
..
}) => {
// Fast-forward time to when the next instr is ready
assert!(ready_cycle > current_cycle);
current_cycle = ready_cycle;
continue;
}
},
Some(ReadyInstr { index, .. }) => index,
};
// Schedule the instruction
instr_order.push(next_idx);
current_cycle += g.nodes[next_idx].label.exec_latency;
let outgoing_edges =
std::mem::take(&mut g.nodes[next_idx].outgoing_edges);
for edge in outgoing_edges.into_iter() {
let dep_instr = &mut g.nodes[edge.head_idx].label;
dep_instr.ready_cycle =
max(dep_instr.ready_cycle, current_cycle + edge.label.latency);
dep_instr.num_uses -= 1;
if dep_instr.num_uses == 0 {
future_ready_instrs
.push(FutureReadyInstr::new(g, edge.head_idx));
}
}
}
return instr_order;
}
fn sched_buffer(
sm: &dyn ShaderModel,
instrs: Vec<Box<Instr>>,
) -> impl Iterator<Item = Box<Instr>> {
let mut g = generate_dep_graph(sm, &instrs);
let init_ready_list = calc_statistics(&mut g);
// save_graphviz(&instrs, &g).unwrap();
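// Reverse the edges so generate_order can schedule bottom-up: it walks from
// the end of the block toward the start, and the resulting order is flipped
// back into program order below.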
g.reverse();
let new_order = generate_order(&mut g, init_ready_list);
// Apply the new instruction order
let mut instrs: Vec<Option<Box<Instr>>> =
instrs.into_iter().map(|instr| Some(instr)).collect();
new_order.into_iter().rev().map(move |i| {
std::mem::take(&mut instrs[i]).expect("Instruction scheduled twice")
})
}
impl Function {
pub fn opt_instr_sched_postpass(&mut self, sm: &dyn ShaderModel) {
for block in &mut self.blocks {
let orig_instr_count = block.instrs.len();
let instrs = std::mem::take(&mut block.instrs);
block.instrs = sched_buffer(sm, instrs).collect();
assert_eq!(orig_instr_count, block.instrs.len());
}
}
}
impl Shader<'_> {
/// Post-RA instruction scheduling
///
/// Uses the popular latency-weighted-depth heuristic.
/// See e.g. Cooper & Torczon's "Engineering a Compiler", 3rd ed.
/// Chapter 12.3 "Local scheduling"
pub fn opt_instr_sched_postpass(&mut self) {
for f in &mut self.functions {
f.opt_instr_sched_postpass(self.sm);
}
}
}
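
To make the heuristic concrete, here is a hypothetical unit test (not part of
this commit) that exercises calc_statistics on a three-instruction chain. It
assumes it would live inside opt_instr_sched_common so that DepGraph,
NodeLabel, and EdgeLabel are in scope.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn latency_weighted_depth() {
        // Three instructions: 0 feeds 1 (6 cycles of latency), 1 feeds 2 (4 cycles).
        let mut g = DepGraph::new((0..3).map(|_| NodeLabel::default()));
        g.add_edge(0, 1, EdgeLabel { latency: 6 });
        g.add_edge(1, 2, EdgeLabel { latency: 4 });

        let ready = calc_statistics(&mut g);

        // Instruction 2 has no dependents, so it seeds the bottom-up ready list.
        assert_eq!(ready, vec![2]);
        // cycles_to_end is the longest latency-weighted path through the rest
        // of the block: 0 for instr 2, 4 for instr 1, 6 + 4 = 10 for instr 0.
        assert_eq!(g.nodes[2].label.cycles_to_end, 0);
        assert_eq!(g.nodes[1].label.cycles_to_end, 4);
        assert_eq!(g.nodes[0].label.cycles_to_end, 10);
    }
}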