nak: Add a simple postpass instruction scheduler
To get us started, this is designed to be pretty much the simplest thing
possible. It runs post-RA so we don't need to worry about hurting occupancy,
and it uses the classic textbook algorithm for local (single-block)
scheduling with the usual latency-weighted-depth heuristic.

-14.22% static cycle count on shaderdb

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32311>
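
For context on the heuristic (an aside, not part of the commit): latency-weighted depth ranks each instruction by the longest latency-weighted path from it to the end of its block, and the list scheduler greedily issues the ready instruction with the largest such value. A minimal sketch of that priority computation over a dependence DAG, with hypothetical types standing in for the commit's IR:

    // Hypothetical sketch, not the commit's code. Nodes are assumed to be
    // stored in topological order; edges are (tail, head, latency) tuples
    // with tail < head.
    fn latency_weighted_depth(
        num_nodes: usize,
        edges: &[(usize, usize, u32)],
    ) -> Vec<u32> {
        let mut depth = vec![0u32; num_nodes];
        // Walk backward so every successor is finalized before its
        // predecessors are visited.
        for i in (0..num_nodes).rev() {
            for &(tail, head, latency) in edges {
                if tail == i {
                    depth[i] = depth[i].max(depth[head] + latency);
                }
            }
        }
        depth
    }

This mirrors what calc_statistics() in the new opt_instr_sched_common.rs below computes per block before scheduling.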

parent d06d76a0d4
commit 79d0f8263d

4 changed files with 665 additions and 0 deletions

@@ -439,6 +439,7 @@ fn nak_compile_shader_internal(
 
     s.remove_annotations();
 
+    pass!(s, opt_instr_sched_postpass);
     pass!(s, calc_instr_deps);
 
     s.gather_info();

@@ -16,6 +16,8 @@ mod opt_bar_prop;
 mod opt_copy_prop;
 mod opt_crs;
 mod opt_dce;
+mod opt_instr_sched_common;
+mod opt_instr_sched_postpass;
 mod opt_jump_thread;
 mod opt_lop;
 mod opt_out;

413  src/nouveau/compiler/nak/opt_instr_sched_common.rs  (new file)

@@ -0,0 +1,413 @@
// Copyright © 2024 Valve Corporation
// SPDX-License-Identifier: MIT

use crate::ir::*;
use std::cmp::max;
use std::cmp::Reverse;

pub mod graph {
    #[derive(Clone)]
    pub struct Edge<EdgeLabel> {
        pub label: EdgeLabel,
        pub head_idx: usize,
    }

    #[derive(Clone)]
    pub struct Node<NodeLabel, EdgeLabel> {
        pub label: NodeLabel,
        pub outgoing_edges: Vec<Edge<EdgeLabel>>,
    }

    #[derive(Clone)]
    pub struct Graph<NodeLabel, EdgeLabel> {
        pub nodes: Vec<Node<NodeLabel, EdgeLabel>>,
    }

    impl<NodeLabel, EdgeLabel> Graph<NodeLabel, EdgeLabel> {
        pub fn new(node_labels: impl Iterator<Item = NodeLabel>) -> Self {
            let nodes = node_labels
                .map(|label| Node {
                    label,
                    outgoing_edges: Vec::new(),
                })
                .collect();

            Graph { nodes }
        }

        pub fn add_edge(
            &mut self,
            tail_idx: usize,
            head_idx: usize,
            label: EdgeLabel,
        ) {
            assert!(head_idx < self.nodes.len());
            self.nodes[tail_idx]
                .outgoing_edges
                .push(Edge { label, head_idx });
        }

        pub fn reverse(&mut self) {
            let old_edges: Vec<_> = self
                .nodes
                .iter_mut()
                .map(|node| std::mem::take(&mut node.outgoing_edges))
                .collect();

            for (tail_idx, edges) in old_edges.into_iter().enumerate() {
                for e in edges.into_iter() {
                    self.add_edge(e.head_idx, tail_idx, e.label);
                }
            }
        }
    }
}
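
// Illustrative aside (not part of the committed file): a tiny Graph with
// made-up labels, showing how reverse() flips every edge in place.
//
//     let mut g: graph::Graph<&str, u32> =
//         graph::Graph::new(["a", "b", "c"].into_iter());
//     g.add_edge(0, 1, 4); // a -> b
//     g.add_edge(1, 2, 2); // b -> c
//     g.reverse();         // edges are now b -> a and c -> b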

#[derive(Eq, PartialEq)]
pub enum SideEffect {
    /// No side effect (ALU-like)
    None,

    /// Instruction reads or writes memory
    ///
    /// This will be serialized with respect to other
    /// SideEffect::Memory instructions
    Memory,

    /// This instruction is a full code motion barrier
    ///
    /// No other instruction will be re-ordered with respect to this one
    Barrier,
}
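
// Illustrative aside (not part of the committed file): for a sequence
// "Ld; FAdd; St", side_effect_type() below classifies the Ld and St as
// SideEffect::Memory and the FAdd as SideEffect::None, so the dependence
// graph serializes the Ld before the St but leaves the FAdd free to move
// across both (data dependences permitting).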

pub fn side_effect_type(op: &Op) -> SideEffect {
    match op {
        // Float ALU
        Op::F2FP(_)
        | Op::FAdd(_)
        | Op::FFma(_)
        | Op::FMnMx(_)
        | Op::FMul(_)
        | Op::FSet(_)
        | Op::FSetP(_)
        | Op::HAdd2(_)
        | Op::HFma2(_)
        | Op::HMul2(_)
        | Op::HSet2(_)
        | Op::HSetP2(_)
        | Op::HMnMx2(_)
        | Op::FSwzAdd(_) => SideEffect::None,

        // Multi-function unit
        Op::Rro(_) | Op::MuFu(_) => SideEffect::None,

        // Double-precision float ALU
        Op::DAdd(_)
        | Op::DFma(_)
        | Op::DMnMx(_)
        | Op::DMul(_)
        | Op::DSetP(_) => SideEffect::None,

        // Integer ALU
        Op::BRev(_)
        | Op::Flo(_)
        | Op::PopC(_)
        | Op::IMad(_)
        | Op::IMul(_)
        | Op::BMsk(_)
        | Op::IAbs(_)
        | Op::IAdd2(_)
        | Op::IAdd2X(_)
        | Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::IDp4(_)
        | Op::IMad64(_)
        | Op::IMnMx(_)
        | Op::ISetP(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::Lop2(_)
        | Op::Lop3(_)
        | Op::Shf(_)
        | Op::Shl(_)
        | Op::Shr(_)
        | Op::Bfe(_) => SideEffect::None,

        // Conversions
        Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => {
            SideEffect::None
        }

        // Move ops
        Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => SideEffect::None,
        Op::Shfl(_) => SideEffect::None,

        // Predicate ops
        Op::PLop3(_) | Op::PSetP(_) => SideEffect::None,

        // Uniform ops
        Op::R2UR(_) => SideEffect::None,

        // Texture ops
        Op::Tex(_)
        | Op::Tld(_)
        | Op::Tld4(_)
        | Op::Tmml(_)
        | Op::Txd(_)
        | Op::Txq(_) => SideEffect::Memory,

        // Surface ops
        Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => SideEffect::Memory,

        // Memory ops
        Op::Ipa(_) | Op::Ldc(_) => SideEffect::None,
        Op::Ld(_)
        | Op::St(_)
        | Op::Atom(_)
        | Op::AL2P(_)
        | Op::ALd(_)
        | Op::ASt(_)
        | Op::CCtl(_)
        | Op::LdTram(_)
        | Op::MemBar(_) => SideEffect::Memory,

        // Control-flow ops
        Op::BClear(_)
        | Op::Break(_)
        | Op::BSSy(_)
        | Op::BSync(_)
        | Op::SSy(_)
        | Op::Sync(_)
        | Op::Brk(_)
        | Op::PBk(_)
        | Op::Cont(_)
        | Op::PCnt(_)
        | Op::Bra(_)
        | Op::Exit(_)
        | Op::WarpSync(_) => SideEffect::Barrier,

        // We don't model the barrier register yet, so serialize these
        Op::BMov(_) => SideEffect::Memory,

        // Geometry ops
        Op::Out(_) | Op::OutFinal(_) => SideEffect::Barrier,

        // Miscellaneous ops
        Op::Bar(_) | Op::CS2R(_) | Op::Isberd(_) | Op::Kill(_) | Op::S2R(_) => {
            SideEffect::Barrier
        }
        Op::PixLd(_) | Op::Nop(_) | Op::Vote(_) => SideEffect::None,

        // Virtual ops
        Op::Annotate(_)
        | Op::ParCopy(_)
        | Op::Swap(_)
        | Op::Copy(_)
        | Op::Undef(_) => SideEffect::None,

        Op::SrcBar(_)
        | Op::Pin(_)
        | Op::Unpin(_)
        | Op::PhiSrcs(_)
        | Op::PhiDsts(_)
        | Op::RegOut(_) => SideEffect::Barrier,
    }
}

/// Try to guess how many cycles a variable latency instruction will take
///
/// These values are based on the cycle estimates from "Dissecting the NVidia
/// Turing T4 GPU via Microbenchmarking" https://arxiv.org/pdf/1903.07486
/// Memory instructions were copied from L1 data cache latencies.
/// For instructions not mentioned in the paper, I made up numbers.
/// This could probably be improved.
pub fn estimate_variable_latency(sm: u8, op: &Op) -> u32 {
    match op {
        // Multi-function unit
        Op::Rro(_) | Op::MuFu(_) => 15,

        // Double-precision float ALU
        Op::DFma(_) | Op::DSetP(_) => 54,
        Op::DAdd(_) | Op::DMnMx(_) | Op::DMul(_) => 48,

        // Integer ALU
        Op::BRev(_) | Op::Flo(_) | Op::PopC(_) => 15,
        Op::IMad(_) | Op::IMul(_) => {
            assert!(sm < 70);
            86
        }

        // Conversions
        Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => 15,

        // Move ops
        Op::Shfl(_) => 15,

        // Uniform ops
        Op::R2UR(_) => 15,

        // Texture ops
        Op::Tex(_)
        | Op::Tld(_)
        | Op::Tld4(_)
        | Op::Tmml(_)
        | Op::Txd(_)
        | Op::Txq(_) => 32,

        // Surface ops
        Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => 32,

        // Memory ops
        Op::Ldc(_) => 4,

        Op::Ld(_)
        | Op::St(_)
        | Op::Atom(_)
        | Op::AL2P(_)
        | Op::ALd(_)
        | Op::ASt(_)
        | Op::Ipa(_)
        | Op::CCtl(_)
        | Op::LdTram(_)
        | Op::MemBar(_) => 32,

        // Control-flow ops
        Op::WarpSync(_) => 16,

        // Barrier
        Op::BMov(_) => 16,

        // Geometry ops
        Op::Out(_) | Op::OutFinal(_) => 2,

        // Miscellaneous ops
        Op::Bar(_)
        | Op::CS2R(_)
        | Op::Isberd(_)
        | Op::Kill(_)
        | Op::PixLd(_)
        | Op::S2R(_) => 16,

        _ => panic!("Unknown variable latency op {op}"),
    }
}

#[derive(Default, Clone)]
pub struct NodeLabel {
    pub cycles_to_end: u32,
    pub num_uses: u32,

    /// The first cycle that the instruction can begin executing
    pub ready_cycle: u32,

    pub exec_latency: u32,
}

pub struct EdgeLabel {
    pub latency: u32,
}

pub type DepGraph = graph::Graph<NodeLabel, EdgeLabel>;

pub fn calc_statistics(g: &mut DepGraph) -> Vec<usize> {
    let mut initial_ready_list = Vec::new();
    for i in (0..g.nodes.len()).rev() {
        let node = &g.nodes[i];
        let mut max_delay = 0;
        for edge in &node.outgoing_edges {
            assert!(edge.head_idx > i);
            max_delay = max(
                max_delay,
                g.nodes[edge.head_idx].label.cycles_to_end + edge.label.latency,
            );
        }
        let node = &mut g.nodes[i];
        node.label.cycles_to_end = max_delay;
        node.label.num_uses = node.outgoing_edges.len().try_into().unwrap();
        if node.label.num_uses == 0 {
            initial_ready_list.push(i);
        }
    }
    return initial_ready_list;
}

#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct ReadyInstr {
    cycles_to_end: u32,

    // We use the original instruction order as a final tie-breaker, the idea
    // being that the original schedule is often not too bad. Since we're
    // iterating in reverse order, that means scheduling the largest
    // instruction index first.
    pub index: usize,
}

impl ReadyInstr {
    pub fn new<E>(g: &graph::Graph<NodeLabel, E>, i: usize) -> Self {
        let label = &g.nodes[i].label;
        // NodeLabel::cycles_to_end is cycles from the beginning of the
        // instruction to the top of the block, but we want from the end of
        // the instruction to the top of the block
        let cycles_to_end = label.cycles_to_end + label.exec_latency;
        ReadyInstr {
            cycles_to_end,
            index: i,
        }
    }
}

#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct FutureReadyInstr {
    /// The first cycle that the instruction can end executing
    pub ready_cycle: Reverse<u32>,
    pub index: usize,
}

impl FutureReadyInstr {
    pub fn new<E>(g: &graph::Graph<NodeLabel, E>, i: usize) -> Self {
        let label = &g.nodes[i].label;
        // NodeLabel::ready_cycle is the earliest beginning cycle for the
        // instruction, but we need the earliest end cycle for the instruction.
        let ready_cycle = label.ready_cycle.saturating_sub(label.exec_latency);
        FutureReadyInstr {
            ready_cycle: Reverse(ready_cycle),
            index: i,
        }
    }
}
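
// Illustrative aside (not part of the committed file): both structs rely on
// the derived lexicographic Ord, which compares the first field before the
// index tie-breaker. std::collections::BinaryHeap is a max-heap, so a heap
// of ReadyInstr pops the largest cycles_to_end first (the critical-path
// pick), while FutureReadyInstr wraps its cycle in Reverse so the heap pops
// the smallest ready cycle first, i.e. whichever instruction becomes ready
// soonest.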

#[allow(dead_code)]
pub fn save_graphviz(
    instrs: &[Box<Instr>],
    g: &DepGraph,
) -> std::io::Result<()> {
    // dot /tmp/instr_dep_graph.dot -Tsvg > /tmp/instr_dep_graph.svg

    use std::fs::File;
    use std::io::{BufWriter, Write};

    let file = File::create("/tmp/instr_dep_graph.dot")?;
    let mut w = BufWriter::new(file);

    writeln!(w, "digraph {{")?;
    for (i, instr) in instrs.iter().enumerate() {
        let l = &g.nodes[i].label;
        writeln!(
            w,
            " {i} [label=\"{}\\n{}, {}\"];",
            instr, l.cycles_to_end, l.num_uses
        )?;
    }
    for (i, node) in g.nodes.iter().enumerate() {
        for j in &node.outgoing_edges {
            writeln!(
                w,
                " {i} -> {} [label=\"{}\"];",
                j.head_idx, j.label.latency
            )?;
        }
    }
    writeln!(w, "}}")?;
    w.flush()?;
    Ok(())
}

249  src/nouveau/compiler/nak/opt_instr_sched_postpass.rs  (new file)

@@ -0,0 +1,249 @@
// Copyright © 2024 Valve Corporation
// SPDX-License-Identifier: MIT

use crate::ir::*;
use crate::opt_instr_sched_common::*;
use crate::sched_common::{
    exec_latency, instr_latency, paw_latency, raw_latency, war_latency,
    waw_latency, RegTracker,
};
use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BinaryHeap;

struct RegUse<T: Clone> {
    reads: Vec<T>,
    write: Option<T>,
}

impl<T: Clone> RegUse<T> {
    pub fn new() -> Self {
        RegUse {
            reads: Vec::new(),
            write: None,
        }
    }

    pub fn add_read(&mut self, dep: T) {
        self.reads.push(dep);
    }

    pub fn set_write(&mut self, dep: T) {
        self.write = Some(dep);
        self.reads.clear();
    }
}

fn generate_dep_graph(
    sm: &dyn ShaderModel,
    instrs: &Vec<Box<Instr>>,
) -> DepGraph {
    let mut g = DepGraph::new((0..instrs.len()).map(|_| Default::default()));

    // Maps registers to RegUse<(ip, src_dst_idx)>. Predicates are
    // represented by src_idx = usize::MAX.
    let mut uses: Box<RegTracker<RegUse<(usize, usize)>>> =
        Box::new(RegTracker::new_with(&|| RegUse::new()));

    let mut last_memory_op = None;
    let mut last_barrier_op = None;

    for ip in (0..instrs.len()).rev() {
        let instr = &instrs[ip];

        if let Some(bar_ip) = last_barrier_op {
            g.add_edge(ip, bar_ip, EdgeLabel { latency: 0 });
        }

        match side_effect_type(&instr.op) {
            SideEffect::None => (),
            SideEffect::Barrier => {
                let last_ip = last_barrier_op.unwrap_or(instrs.len());
                for other_ip in (ip + 1)..last_ip {
                    g.add_edge(ip, other_ip, EdgeLabel { latency: 0 });
                }
                last_barrier_op = Some(ip);
            }
            SideEffect::Memory => {
                if let Some(mem_ip) = last_memory_op {
                    g.add_edge(ip, mem_ip, EdgeLabel { latency: 0 });
                }
                last_memory_op = Some(ip);
            }
        }

        uses.for_each_instr_dst_mut(instr, |i, u| {
            if let Some((w_ip, w_dst_idx)) = u.write {
                let latency = waw_latency(
                    sm.sm(),
                    &instr.op,
                    i,
                    &instrs[w_ip].op,
                    w_dst_idx,
                );
                g.add_edge(ip, w_ip, EdgeLabel { latency });
            }

            for &(r_ip, r_src_idx) in &u.reads {
                let mut latency = if r_src_idx == usize::MAX {
                    paw_latency(sm.sm(), &instr.op, i)
                } else {
                    raw_latency(
                        sm.sm(),
                        &instr.op,
                        i,
                        &instrs[r_ip].op,
                        r_src_idx,
                    )
                };
                if !instr.has_fixed_latency(sm.sm()) {
                    latency = max(
                        latency,
                        estimate_variable_latency(sm.sm(), &instr.op),
                    );
                }
                g.add_edge(ip, r_ip, EdgeLabel { latency });
            }
        });
        uses.for_each_instr_src_mut(instr, |i, u| {
            if let Some((w_ip, w_dst_idx)) = u.write {
                let latency = war_latency(
                    sm.sm(),
                    &instr.op,
                    i,
                    &instrs[w_ip].op,
                    w_dst_idx,
                );
                g.add_edge(ip, w_ip, EdgeLabel { latency });
            }
        });

        // We're iterating in reverse, so writes are logically first
        uses.for_each_instr_dst_mut(instr, |i, c| {
            c.set_write((ip, i));
        });
        uses.for_each_instr_pred_mut(instr, |c| {
            c.add_read((ip, usize::MAX));
        });
        uses.for_each_instr_src_mut(instr, |i, c| {
            c.add_read((ip, i));
        });

        // Initialize this node's distance to the end
        let mut ready_cycle = (0..instr.dsts().len())
            .map(|i| instr_latency(sm.sm(), &instr.op, i))
            .max()
            .unwrap_or(0);
        if !instr.has_fixed_latency(sm.sm()) {
            let var_latency = estimate_variable_latency(sm.sm(), &instr.op)
                + exec_latency(sm.sm(), &instrs[instrs.len() - 1].op);
            ready_cycle = max(ready_cycle, var_latency);
        }
        let label = &mut g.nodes[ip].label;
        label.exec_latency = exec_latency(sm.sm(), &instr.op);
        label.ready_cycle = ready_cycle;
    }

    g
}
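
// Illustrative aside (not part of the committed file): for the pair
//
//     r0 = IAdd3 r1, r2, rZ    // ip = 0
//     r3 = IMad  r0, r4, r5    // ip = 1
//
// the reverse walk above visits the IMad first and records its reads, so
// when ip = 0 is processed, the write of r0 finds the recorded read and a
// RAW edge 0 -> 1 is added with raw_latency(). A later write to r1 or r2
// would produce a WAR edge through for_each_instr_src_mut the same way.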

fn generate_order(g: &mut DepGraph, init_ready_list: Vec<usize>) -> Vec<usize> {
    let mut ready_instrs: BinaryHeap<ReadyInstr> = BinaryHeap::new();
    let mut future_ready_instrs: BinaryHeap<FutureReadyInstr> = init_ready_list
        .into_iter()
        .map(|i| FutureReadyInstr::new(g, i))
        .collect();

    let mut current_cycle = 0;
    let mut instr_order = Vec::with_capacity(g.nodes.len());
    loop {
        // Move ready instructions to the ready list
        loop {
            let Some(fri) = future_ready_instrs.peek() else {
                break;
            };
            if current_cycle < fri.ready_cycle.0 {
                break;
            }
            ready_instrs.push(ReadyInstr::new(g, fri.index));
            future_ready_instrs.pop();
        }

        // Pick a ready instruction
        let next_idx = match ready_instrs.pop() {
            None => match future_ready_instrs.peek() {
                None => break, // Both lists are empty. We're done!
                Some(&FutureReadyInstr {
                    ready_cycle: Reverse(ready_cycle),
                    ..
                }) => {
                    // Fast-forward time to when the next instr is ready
                    assert!(ready_cycle > current_cycle);
                    current_cycle = ready_cycle;
                    continue;
                }
            },
            Some(ReadyInstr { index, .. }) => index,
        };

        // Schedule the instruction
        instr_order.push(next_idx);
        current_cycle += g.nodes[next_idx].label.exec_latency;

        let outgoing_edges =
            std::mem::take(&mut g.nodes[next_idx].outgoing_edges);
        for edge in outgoing_edges.into_iter() {
            let dep_instr = &mut g.nodes[edge.head_idx].label;
            dep_instr.ready_cycle =
                max(dep_instr.ready_cycle, current_cycle + edge.label.latency);
            dep_instr.num_uses -= 1;
            if dep_instr.num_uses == 0 {
                future_ready_instrs
                    .push(FutureReadyInstr::new(g, edge.head_idx));
            }
        }
    }
    return instr_order;
}
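
// Illustrative aside (not part of the committed file): since the graph was
// reversed, this is ordinary forward list scheduling over the reversed
// program. If the ready heap holds entries with cycles_to_end 40 and 12,
// the 40 is popped first. When nothing is ready, the scheduler peeks the
// earliest future_ready_instrs entry and fast-forwards current_cycle to its
// ready cycle rather than stepping one cycle at a time.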

fn sched_buffer(
    sm: &dyn ShaderModel,
    instrs: Vec<Box<Instr>>,
) -> impl Iterator<Item = Box<Instr>> {
    let mut g = generate_dep_graph(sm, &instrs);
    let init_ready_list = calc_statistics(&mut g);
    // save_graphviz(&instrs, &g).unwrap();
    g.reverse();
    let new_order = generate_order(&mut g, init_ready_list);

    // Apply the new instruction order
    let mut instrs: Vec<Option<Box<Instr>>> =
        instrs.into_iter().map(|instr| Some(instr)).collect();
    new_order.into_iter().rev().map(move |i| {
        std::mem::take(&mut instrs[i]).expect("Instruction scheduled twice")
    })
}
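
// Illustrative aside (not part of the committed file): generate_order()
// emits the block back-to-front, so the .rev() above restores program
// order, and the Vec<Option<Box<Instr>>> dance lets each instruction be
// moved out exactly once, with the expect() catching any double-schedule.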

impl Function {
    pub fn opt_instr_sched_postpass(&mut self, sm: &dyn ShaderModel) {
        for block in &mut self.blocks {
            let orig_instr_count = block.instrs.len();
            let instrs = std::mem::take(&mut block.instrs);
            block.instrs = sched_buffer(sm, instrs).collect();
            assert_eq!(orig_instr_count, block.instrs.len());
        }
    }
}

impl Shader<'_> {
    /// Post-RA instruction scheduling
    ///
    /// Uses the popular latency-weighted-depth heuristic.
    /// See e.g. Cooper & Torczon's "Engineering A Compiler", 3rd ed.,
    /// Chapter 12.3 "Local scheduling"
    pub fn opt_instr_sched_postpass(&mut self) {
        for f in &mut self.functions {
            f.opt_instr_sched_postpass(self.sm);
        }
    }
}