From 79d0f8263da40bc4ecc75861e3f4ca1a4af2af39 Mon Sep 17 00:00:00 2001
From: Mel Henning
Date: Mon, 18 Nov 2024 12:26:51 -0500
Subject: [PATCH] nak: Add a simple postpass instruction scheduler

To get us started, this is designed to be pretty much the simplest thing
possible. It runs post-RA, so we don't need to worry about hurting
occupancy, and it uses the classic textbook algorithm for local
(single-block) scheduling with the usual latency-weighted-depth
heuristic. (A standalone sketch of the algorithm is appended after the
patch for illustration.)

-14.22% static cycle count on shaderdb

Part-of:
---
 src/nouveau/compiler/nak/api.rs               |   1 +
 src/nouveau/compiler/nak/lib.rs               |   2 +
 .../compiler/nak/opt_instr_sched_common.rs    | 413 ++++++++++++++++++
 .../compiler/nak/opt_instr_sched_postpass.rs  | 249 +++++++++++
 4 files changed, 665 insertions(+)
 create mode 100644 src/nouveau/compiler/nak/opt_instr_sched_common.rs
 create mode 100644 src/nouveau/compiler/nak/opt_instr_sched_postpass.rs

diff --git a/src/nouveau/compiler/nak/api.rs b/src/nouveau/compiler/nak/api.rs
index f81edb995a3..b20ef2baff9 100644
--- a/src/nouveau/compiler/nak/api.rs
+++ b/src/nouveau/compiler/nak/api.rs
@@ -439,6 +439,7 @@ fn nak_compile_shader_internal(
 
     s.remove_annotations();
 
+    pass!(s, opt_instr_sched_postpass);
     pass!(s, calc_instr_deps);
 
     s.gather_info();
diff --git a/src/nouveau/compiler/nak/lib.rs b/src/nouveau/compiler/nak/lib.rs
index eeee20e0adf..2f6157a2407 100644
--- a/src/nouveau/compiler/nak/lib.rs
+++ b/src/nouveau/compiler/nak/lib.rs
@@ -16,6 +16,8 @@ mod opt_bar_prop;
 mod opt_copy_prop;
 mod opt_crs;
 mod opt_dce;
+mod opt_instr_sched_common;
+mod opt_instr_sched_postpass;
 mod opt_jump_thread;
 mod opt_lop;
 mod opt_out;
diff --git a/src/nouveau/compiler/nak/opt_instr_sched_common.rs b/src/nouveau/compiler/nak/opt_instr_sched_common.rs
new file mode 100644
index 00000000000..ef54bc7c38a
--- /dev/null
+++ b/src/nouveau/compiler/nak/opt_instr_sched_common.rs
@@ -0,0 +1,413 @@
+// Copyright © 2024 Valve Corporation
+// SPDX-License-Identifier: MIT
+
+use crate::ir::*;
+use std::cmp::max;
+use std::cmp::Reverse;
+
+pub mod graph {
+    #[derive(Clone)]
+    pub struct Edge<EdgeLabel> {
+        pub label: EdgeLabel,
+        pub head_idx: usize,
+    }
+
+    #[derive(Clone)]
+    pub struct Node<NodeLabel, EdgeLabel> {
+        pub label: NodeLabel,
+        pub outgoing_edges: Vec<Edge<EdgeLabel>>,
+    }
+
+    #[derive(Clone)]
+    pub struct Graph<NodeLabel, EdgeLabel> {
+        pub nodes: Vec<Node<NodeLabel, EdgeLabel>>,
+    }
+
+    impl<NodeLabel, EdgeLabel> Graph<NodeLabel, EdgeLabel> {
+        pub fn new(node_labels: impl Iterator<Item = NodeLabel>) -> Self {
+            let nodes = node_labels
+                .map(|label| Node {
+                    label,
+                    outgoing_edges: Vec::new(),
+                })
+                .collect();
+
+            Graph { nodes }
+        }
+
+        pub fn add_edge(
+            &mut self,
+            tail_idx: usize,
+            head_idx: usize,
+            label: EdgeLabel,
+        ) {
+            assert!(head_idx < self.nodes.len());
+            self.nodes[tail_idx]
+                .outgoing_edges
+                .push(Edge { label, head_idx });
+        }
+
+        pub fn reverse(&mut self) {
+            let old_edges: Vec<_> = self
+                .nodes
+                .iter_mut()
+                .map(|node| std::mem::take(&mut node.outgoing_edges))
+                .collect();
+
+            for (tail_idx, edges) in old_edges.into_iter().enumerate() {
+                for e in edges.into_iter() {
+                    self.add_edge(e.head_idx, tail_idx, e.label);
+                }
+            }
+        }
+    }
+}
+
+#[derive(Eq, PartialEq)]
+pub enum SideEffect {
+    /// No side effect (ALU-like)
+    None,
+
+    /// Instruction reads or writes memory
+    ///
+    /// This will be serialized with respect to other
+    /// SideEffect::Memory instructions
+    Memory,
+
+    /// This instruction is a full code motion barrier
+    ///
+    /// No other instruction will be re-ordered with respect to this one
+    Barrier,
+}
+
+pub fn side_effect_type(op: &Op) -> SideEffect {
+    match op {
+        // Float ALU
+        Op::F2FP(_)
+        | Op::FAdd(_)
+        | Op::FFma(_)
+        | 
Op::FMnMx(_) + | Op::FMul(_) + | Op::FSet(_) + | Op::FSetP(_) + | Op::HAdd2(_) + | Op::HFma2(_) + | Op::HMul2(_) + | Op::HSet2(_) + | Op::HSetP2(_) + | Op::HMnMx2(_) + | Op::FSwzAdd(_) => SideEffect::None, + + // Multi-function unit + Op::Rro(_) | Op::MuFu(_) => SideEffect::None, + + // Double-precision float ALU + Op::DAdd(_) + | Op::DFma(_) + | Op::DMnMx(_) + | Op::DMul(_) + | Op::DSetP(_) => SideEffect::None, + + // Integer ALU + Op::BRev(_) + | Op::Flo(_) + | Op::PopC(_) + | Op::IMad(_) + | Op::IMul(_) + | Op::BMsk(_) + | Op::IAbs(_) + | Op::IAdd2(_) + | Op::IAdd2X(_) + | Op::IAdd3(_) + | Op::IAdd3X(_) + | Op::IDp4(_) + | Op::IMad64(_) + | Op::IMnMx(_) + | Op::ISetP(_) + | Op::Lea(_) + | Op::LeaX(_) + | Op::Lop2(_) + | Op::Lop3(_) + | Op::Shf(_) + | Op::Shl(_) + | Op::Shr(_) + | Op::Bfe(_) => SideEffect::None, + + // Conversions + Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => { + SideEffect::None + } + + // Move ops + Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => SideEffect::None, + Op::Shfl(_) => SideEffect::None, + + // Predicate ops + Op::PLop3(_) | Op::PSetP(_) => SideEffect::None, + + // Uniform ops + Op::R2UR(_) => SideEffect::None, + + // Texture ops + Op::Tex(_) + | Op::Tld(_) + | Op::Tld4(_) + | Op::Tmml(_) + | Op::Txd(_) + | Op::Txq(_) => SideEffect::Memory, + + // Surface ops + Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => SideEffect::Memory, + + // Memory ops + Op::Ipa(_) | Op::Ldc(_) => SideEffect::None, + Op::Ld(_) + | Op::St(_) + | Op::Atom(_) + | Op::AL2P(_) + | Op::ALd(_) + | Op::ASt(_) + | Op::CCtl(_) + | Op::LdTram(_) + | Op::MemBar(_) => SideEffect::Memory, + + // Control-flow ops + Op::BClear(_) + | Op::Break(_) + | Op::BSSy(_) + | Op::BSync(_) + | Op::SSy(_) + | Op::Sync(_) + | Op::Brk(_) + | Op::PBk(_) + | Op::Cont(_) + | Op::PCnt(_) + | Op::Bra(_) + | Op::Exit(_) + | Op::WarpSync(_) => SideEffect::Barrier, + + // We don't model the barrier register yet, so serialize these + Op::BMov(_) => SideEffect::Memory, + + // Geometry ops + Op::Out(_) | Op::OutFinal(_) => SideEffect::Barrier, + + // Miscellaneous ops + Op::Bar(_) | Op::CS2R(_) | Op::Isberd(_) | Op::Kill(_) | Op::S2R(_) => { + SideEffect::Barrier + } + Op::PixLd(_) | Op::Nop(_) | Op::Vote(_) => SideEffect::None, + + // Virtual ops + Op::Annotate(_) + | Op::ParCopy(_) + | Op::Swap(_) + | Op::Copy(_) + | Op::Undef(_) => SideEffect::None, + + Op::SrcBar(_) + | Op::Pin(_) + | Op::Unpin(_) + | Op::PhiSrcs(_) + | Op::PhiDsts(_) + | Op::RegOut(_) => SideEffect::Barrier, + } +} + +/// Try to guess how many cycles a variable latency instruction will take +/// +/// These values are based on the cycle estimates from "Dissecting the NVidia +/// Turing T4 GPU via Microbenchmarking" https://arxiv.org/pdf/1903.07486 +/// Memory instructions were copied from L1 data cache latencies. +/// For instructions not mentioned in the paper, I made up numbers. +/// This could probably be improved. 
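+///
+/// Returns the estimated latency in cycles for shader model `sm`.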
+pub fn estimate_variable_latency(sm: u8, op: &Op) -> u32 {
+    match op {
+        // Multi-function unit
+        Op::Rro(_) | Op::MuFu(_) => 15,
+
+        // Double-precision float ALU
+        Op::DFma(_) | Op::DSetP(_) => 54,
+        Op::DAdd(_) | Op::DMnMx(_) | Op::DMul(_) => 48,
+
+        // Integer ALU
+        Op::BRev(_) | Op::Flo(_) | Op::PopC(_) => 15,
+        Op::IMad(_) | Op::IMul(_) => {
+            assert!(sm < 70);
+            86
+        }
+
+        // Conversions
+        Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => 15,
+
+        // Move ops
+        Op::Shfl(_) => 15,
+
+        // Uniform ops
+        Op::R2UR(_) => 15,
+
+        // Texture ops
+        Op::Tex(_)
+        | Op::Tld(_)
+        | Op::Tld4(_)
+        | Op::Tmml(_)
+        | Op::Txd(_)
+        | Op::Txq(_) => 32,
+
+        // Surface ops
+        Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => 32,
+
+        // Memory ops
+        Op::Ldc(_) => 4,
+
+        Op::Ld(_)
+        | Op::St(_)
+        | Op::Atom(_)
+        | Op::AL2P(_)
+        | Op::ALd(_)
+        | Op::ASt(_)
+        | Op::Ipa(_)
+        | Op::CCtl(_)
+        | Op::LdTram(_)
+        | Op::MemBar(_) => 32,
+
+        // Control-flow ops
+        Op::WarpSync(_) => 16,
+
+        // Barrier
+        Op::BMov(_) => 16,
+
+        // Geometry ops
+        Op::Out(_) | Op::OutFinal(_) => 2,
+
+        // Miscellaneous ops
+        Op::Bar(_)
+        | Op::CS2R(_)
+        | Op::Isberd(_)
+        | Op::Kill(_)
+        | Op::PixLd(_)
+        | Op::S2R(_) => 16,
+
+        _ => panic!("Unknown variable latency op {op}"),
+    }
+}
+
+#[derive(Default, Clone)]
+pub struct NodeLabel {
+    pub cycles_to_end: u32,
+    pub num_uses: u32,
+
+    /// The first cycle that the instruction can begin executing
+    pub ready_cycle: u32,
+
+    pub exec_latency: u32,
+}
+
+pub struct EdgeLabel {
+    pub latency: u32,
+}
+
+pub type DepGraph = graph::Graph<NodeLabel, EdgeLabel>;
+
+pub fn calc_statistics(g: &mut DepGraph) -> Vec<usize> {
+    let mut initial_ready_list = Vec::new();
+    for i in (0..g.nodes.len()).rev() {
+        let node = &g.nodes[i];
+        let mut max_delay = 0;
+        for edge in &node.outgoing_edges {
+            assert!(edge.head_idx > i);
+            max_delay = max(
+                max_delay,
+                g.nodes[edge.head_idx].label.cycles_to_end + edge.label.latency,
+            );
+        }
+        let node = &mut g.nodes[i];
+        node.label.cycles_to_end = max_delay;
+        node.label.num_uses = node.outgoing_edges.len().try_into().unwrap();
+        if node.label.num_uses == 0 {
+            initial_ready_list.push(i);
+        }
+    }
+    return initial_ready_list;
+}
+
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct ReadyInstr {
+    cycles_to_end: u32,
+
+    // We use the original instruction order as a final tie-breaker, the idea
+    // being that the original schedule is often not too bad. Since we're
+    // iterating in reverse order, that means scheduling the largest
+    // instruction index first.
+    pub index: usize,
+}
+
+impl ReadyInstr {
+    pub fn new(g: &graph::Graph<NodeLabel, EdgeLabel>, i: usize) -> Self {
+        let label = &g.nodes[i].label;
+        // NodeLabel::cycles_to_end is cycles from the beginning of the
+        // instruction to the top of the block, but we want from the end of the
+        // instruction to the top of the block
+        let cycles_to_end = label.cycles_to_end + label.exec_latency;
+        ReadyInstr {
+            cycles_to_end,
+            index: i,
+        }
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct FutureReadyInstr {
+    /// The first cycle that the instruction can end executing
+    pub ready_cycle: Reverse<u32>,
+    pub index: usize,
+}
+
+impl FutureReadyInstr {
+    pub fn new(g: &graph::Graph<NodeLabel, EdgeLabel>, i: usize) -> Self {
+        let label = &g.nodes[i].label;
+        // NodeLabel::ready_cycle is the earliest beginning cycle for the
+        // instruction, but we need the earliest end cycle for the instruction.
+        let ready_cycle = label.ready_cycle.saturating_sub(label.exec_latency);
+        FutureReadyInstr {
+            ready_cycle: Reverse(ready_cycle),
+            index: i,
+        }
+    }
+}
+
+#[allow(dead_code)]
+pub fn save_graphviz(
+    instrs: &[Box<Instr>],
+    g: &DepGraph,
+) -> std::io::Result<()> {
+    // dot /tmp/instr_dep_graph.dot -Tsvg > /tmp/instr_dep_graph.svg
+
+    use std::fs::File;
+    use std::io::{BufWriter, Write};
+
+    let file = File::create("/tmp/instr_dep_graph.dot")?;
+    let mut w = BufWriter::new(file);
+
+    writeln!(w, "digraph {{")?;
+    for (i, instr) in instrs.iter().enumerate() {
+        let l = &g.nodes[i].label;
+        writeln!(
+            w,
+            "    {i} [label=\"{}\\n{}, {}\"];",
+            instr, l.cycles_to_end, l.num_uses
+        )?;
+    }
+    for (i, node) in g.nodes.iter().enumerate() {
+        for j in &node.outgoing_edges {
+            writeln!(
+                w,
+                "    {i} -> {} [label=\"{}\"];",
+                j.head_idx, j.label.latency
+            )?;
+        }
+    }
+    writeln!(w, "}}")?;
+    w.flush()?;
+    Ok(())
+}
diff --git a/src/nouveau/compiler/nak/opt_instr_sched_postpass.rs b/src/nouveau/compiler/nak/opt_instr_sched_postpass.rs
new file mode 100644
index 00000000000..eadf1036a01
--- /dev/null
+++ b/src/nouveau/compiler/nak/opt_instr_sched_postpass.rs
@@ -0,0 +1,249 @@
+// Copyright © 2024 Valve Corporation
+// SPDX-License-Identifier: MIT
+
+use crate::ir::*;
+use crate::opt_instr_sched_common::*;
+use crate::sched_common::{
+    exec_latency, instr_latency, paw_latency, raw_latency, war_latency,
+    waw_latency, RegTracker,
+};
+use std::cmp::max;
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
+
+struct RegUse<T> {
+    reads: Vec<T>,
+    write: Option<T>,
+}
+
+impl<T> RegUse<T> {
+    pub fn new() -> Self {
+        RegUse {
+            reads: Vec::new(),
+            write: None,
+        }
+    }
+
+    pub fn add_read(&mut self, dep: T) {
+        self.reads.push(dep);
+    }
+
+    pub fn set_write(&mut self, dep: T) {
+        self.write = Some(dep);
+        self.reads.clear();
+    }
+}
+
+fn generate_dep_graph(
+    sm: &dyn ShaderModel,
+    instrs: &Vec<Box<Instr>>,
+) -> DepGraph {
+    let mut g = DepGraph::new((0..instrs.len()).map(|_| Default::default()));
+
+    // Maps registers to RegUse. Predicates are
+    // represented by src_idx = usize::MAX.
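+    // Each use is recorded as an (ip, src_idx) pair so the per-operand
+    // latency helpers below can be asked about the exact source or
+    // destination slot involved in the dependency.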
+    let mut uses: Box<RegTracker<RegUse<(usize, usize)>>> =
+        Box::new(RegTracker::new_with(&|| RegUse::new()));
+
+    let mut last_memory_op = None;
+    let mut last_barrier_op = None;
+
+    for ip in (0..instrs.len()).rev() {
+        let instr = &instrs[ip];
+
+        if let Some(bar_ip) = last_barrier_op {
+            g.add_edge(ip, bar_ip, EdgeLabel { latency: 0 });
+        }
+
+        match side_effect_type(&instr.op) {
+            SideEffect::None => (),
+            SideEffect::Barrier => {
+                let last_ip = last_barrier_op.unwrap_or(instrs.len());
+                for other_ip in (ip + 1)..last_ip {
+                    g.add_edge(ip, other_ip, EdgeLabel { latency: 0 });
+                }
+                last_barrier_op = Some(ip);
+            }
+            SideEffect::Memory => {
+                if let Some(mem_ip) = last_memory_op {
+                    g.add_edge(ip, mem_ip, EdgeLabel { latency: 0 });
+                }
+                last_memory_op = Some(ip);
+            }
+        }
+
+        uses.for_each_instr_dst_mut(instr, |i, u| {
+            if let Some((w_ip, w_dst_idx)) = u.write {
+                let latency = waw_latency(
+                    sm.sm(),
+                    &instr.op,
+                    i,
+                    &instrs[w_ip].op,
+                    w_dst_idx,
+                );
+                g.add_edge(ip, w_ip, EdgeLabel { latency });
+            }
+
+            for &(r_ip, r_src_idx) in &u.reads {
+                let mut latency = if r_src_idx == usize::MAX {
+                    paw_latency(sm.sm(), &instr.op, i)
+                } else {
+                    raw_latency(
+                        sm.sm(),
+                        &instr.op,
+                        i,
+                        &instrs[r_ip].op,
+                        r_src_idx,
+                    )
+                };
+                if !instr.has_fixed_latency(sm.sm()) {
+                    latency = max(
+                        latency,
+                        estimate_variable_latency(sm.sm(), &instr.op),
+                    );
+                }
+                g.add_edge(ip, r_ip, EdgeLabel { latency });
+            }
+        });
+        uses.for_each_instr_src_mut(instr, |i, u| {
+            if let Some((w_ip, w_dst_idx)) = u.write {
+                let latency = war_latency(
+                    sm.sm(),
+                    &instr.op,
+                    i,
+                    &instrs[w_ip].op,
+                    w_dst_idx,
+                );
+                g.add_edge(ip, w_ip, EdgeLabel { latency });
+            }
+        });
+
+        // We're iterating in reverse, so writes are logically first
+        uses.for_each_instr_dst_mut(instr, |i, c| {
+            c.set_write((ip, i));
+        });
+        uses.for_each_instr_pred_mut(instr, |c| {
+            c.add_read((ip, usize::MAX));
+        });
+        uses.for_each_instr_src_mut(instr, |i, c| {
+            c.add_read((ip, i));
+        });
+
+        // Initialize this node's distance to the end
+        let mut ready_cycle = (0..instr.dsts().len())
+            .map(|i| instr_latency(sm.sm(), &instr.op, i))
+            .max()
+            .unwrap_or(0);
+        if !instr.has_fixed_latency(sm.sm()) {
+            let var_latency = estimate_variable_latency(sm.sm(), &instr.op)
+                + exec_latency(sm.sm(), &instrs[instrs.len() - 1].op);
+            ready_cycle = max(ready_cycle, var_latency);
+        }
+        let label = &mut g.nodes[ip].label;
+        label.exec_latency = exec_latency(sm.sm(), &instr.op);
+        label.ready_cycle = ready_cycle;
+    }
+
+    g
+}
+
+fn generate_order(g: &mut DepGraph, init_ready_list: Vec<usize>) -> Vec<usize> {
+    let mut ready_instrs: BinaryHeap<ReadyInstr> = BinaryHeap::new();
+    let mut future_ready_instrs: BinaryHeap<FutureReadyInstr> = init_ready_list
+        .into_iter()
+        .map(|i| FutureReadyInstr::new(g, i))
+        .collect();
+
+    let mut current_cycle = 0;
+    let mut instr_order = Vec::with_capacity(g.nodes.len());
+    loop {
+        // Move ready instructions to the ready list
+        loop {
+            let Some(fri) = future_ready_instrs.peek() else {
+                break;
+            };
+            if current_cycle < fri.ready_cycle.0 {
+                break;
+            }
+            ready_instrs.push(ReadyInstr::new(g, fri.index));
+            future_ready_instrs.pop();
+        }
+
+        // Pick a ready instruction
+        let next_idx = match ready_instrs.pop() {
+            None => match future_ready_instrs.peek() {
+                None => break, // Both lists are empty. We're done!
+                Some(&FutureReadyInstr {
+                    ready_cycle: Reverse(ready_cycle),
+                    ..
+                }) => {
+                    // Fast-forward time to when the next instr is ready
+                    assert!(ready_cycle > current_cycle);
+                    current_cycle = ready_cycle;
+                    continue;
+                }
+            },
+            Some(ReadyInstr { index, .. }) => index,
+        };
+
+        // Schedule the instruction
+        instr_order.push(next_idx);
+        current_cycle += g.nodes[next_idx].label.exec_latency;
+
+        let outgoing_edges =
+            std::mem::take(&mut g.nodes[next_idx].outgoing_edges);
+        for edge in outgoing_edges.into_iter() {
+            let dep_instr = &mut g.nodes[edge.head_idx].label;
+            dep_instr.ready_cycle =
+                max(dep_instr.ready_cycle, current_cycle + edge.label.latency);
+            dep_instr.num_uses -= 1;
+            if dep_instr.num_uses == 0 {
+                future_ready_instrs
+                    .push(FutureReadyInstr::new(g, edge.head_idx));
+            }
+        }
+    }
+    return instr_order;
+}
+
+fn sched_buffer(
+    sm: &dyn ShaderModel,
+    instrs: Vec<Box<Instr>>,
+) -> impl Iterator<Item = Box<Instr>> {
+    let mut g = generate_dep_graph(sm, &instrs);
+    let init_ready_list = calc_statistics(&mut g);
+    // save_graphviz(&instrs, &g).unwrap();
+    g.reverse();
+    let new_order = generate_order(&mut g, init_ready_list);
+
+    // Apply the new instruction order
+    let mut instrs: Vec<Option<Box<Instr>>> =
+        instrs.into_iter().map(|instr| Some(instr)).collect();
+    new_order.into_iter().rev().map(move |i| {
+        std::mem::take(&mut instrs[i]).expect("Instruction scheduled twice")
+    })
+}
+
+impl Function {
+    pub fn opt_instr_sched_postpass(&mut self, sm: &dyn ShaderModel) {
+        for block in &mut self.blocks {
+            let orig_instr_count = block.instrs.len();
+            let instrs = std::mem::take(&mut block.instrs);
+            block.instrs = sched_buffer(sm, instrs).collect();
+            assert_eq!(orig_instr_count, block.instrs.len());
+        }
+    }
+}
+
+impl Shader<'_> {
+    /// Post-RA instruction scheduling
+    ///
+    /// Uses the popular latency-weighted-depth heuristic.
+    /// See e.g. Cooper & Torczon's "Engineering a Compiler", 3rd ed.,
+    /// Chapter 12.3 "Local Scheduling"
+    pub fn opt_instr_sched_postpass(&mut self) {
+        for f in &mut self.functions {
+            f.opt_instr_sched_postpass(self.sm);
+        }
+    }
+}
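The commit message describes this as the classic textbook list-scheduling
algorithm with the usual latency-weighted-depth heuristic. The standalone
sketch below only illustrates that idea and is not part of the patch:
`ToyNode`, the latencies, and the one-issue-per-cycle model are invented
here, while the real pass works on `DepGraph` with the `sched_common`
latency helpers and the `ReadyInstr`/`FutureReadyInstr` heaps above. The
sketch computes each node's critical-path distance (its latency-weighted
depth), keeps a max-heap of issueable nodes keyed on that depth, and
fast-forwards a virtual clock when every remaining node is still waiting
on an in-flight result.

use std::cmp::Reverse;
use std::collections::BinaryHeap;

struct ToyNode {
    latency: u32,       // cycles before dependents may start
    succs: Vec<usize>,  // indices of dependent nodes
    preds_left: usize,  // number of unscheduled predecessors
    depth: u32,         // latency-weighted depth (scheduling priority)
    ready_at: u32,      // earliest cycle at which this node may issue
}

// Critical-path priority: a node's own latency plus the longest path through
// its successors. Nodes are assumed to be topologically ordered, so a
// reverse sweep sees every successor before its predecessors.
fn latency_weighted_depths(nodes: &mut [ToyNode]) {
    for i in (0..nodes.len()).rev() {
        let lat = nodes[i].latency;
        let mut depth = lat;
        for j in 0..nodes[i].succs.len() {
            let s = nodes[i].succs[j];
            depth = depth.max(lat + nodes[s].depth);
        }
        nodes[i].depth = depth;
    }
}

fn list_schedule(mut nodes: Vec<ToyNode>) -> Vec<usize> {
    latency_weighted_depths(&mut nodes);

    // `ready`: issueable now, max-heap on depth.
    // `pending`: predecessors all scheduled but a result is still in flight,
    // min-heap on ready_at (via Reverse).
    let mut ready: BinaryHeap<(u32, usize)> = BinaryHeap::new();
    let mut pending: BinaryHeap<(Reverse<u32>, usize)> = BinaryHeap::new();
    for (i, n) in nodes.iter().enumerate() {
        if n.preds_left == 0 {
            pending.push((Reverse(n.ready_at), i));
        }
    }

    let mut order = Vec::with_capacity(nodes.len());
    let mut cycle = 0u32;
    loop {
        // Promote everything that has become issueable by now.
        loop {
            let Some(&(Reverse(at), i)) = pending.peek() else {
                break;
            };
            if at > cycle {
                break;
            }
            ready.push((nodes[i].depth, i));
            pending.pop();
        }

        // Issue the deepest ready node, or fast-forward the clock.
        let i = match ready.pop() {
            Some((_, i)) => i,
            None => match pending.peek() {
                Some(&(Reverse(at), _)) => {
                    cycle = at;
                    continue;
                }
                None => break, // both heaps empty: we're done
            },
        };

        order.push(i);
        let done_at = cycle + nodes[i].latency;
        cycle += 1; // one issue slot per cycle in this toy model
        for j in 0..nodes[i].succs.len() {
            let s = nodes[i].succs[j];
            nodes[s].ready_at = nodes[s].ready_at.max(done_at);
            nodes[s].preds_left -= 1;
            if nodes[s].preds_left == 0 {
                pending.push((Reverse(nodes[s].ready_at), s));
            }
        }
    }
    order
}

fn main() {
    // A tiny block: load -> add -> store, plus one independent mov.
    let n = |latency, succs: Vec<usize>, preds_left| ToyNode {
        latency, succs, preds_left, depth: 0, ready_at: 0,
    };
    let nodes = vec![
        n(32, vec![1], 0), // 0: load
        n(4, vec![2], 1),  // 1: add (uses the load result)
        n(4, vec![], 1),   // 2: store (uses the add result)
        n(1, vec![], 0),   // 3: independent mov
    ];
    // The load has the largest depth (32 + 4 + 4 = 40), so it issues first;
    // the mov then fills its latency shadow while the add waits.
    assert_eq!(list_schedule(nodes), vec![0, 3, 1, 2]);
}

Splitting the candidates into an issueable heap and a pending heap ordered by
completion time mirrors the ready_instrs/future_ready_instrs split in
generate_order above, and is what lets independent work slide into the latency
shadow of long operations such as loads.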