diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 1de539a5606..fc9d35e2bd9 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -9,7 +9,6 @@ use nak_bindings::*; pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder}; use crate::legalize::LegalizeBuilder; -use crate::sched_common; use crate::sph::{OutputTopology, PixelImap}; use compiler::as_slice::*; use compiler::cfg::CFG; @@ -7505,45 +7504,41 @@ pub trait ShaderModel { !op.has_fixed_latency(self.sm()) } + /// Latency before another non-NOP can execute fn exec_latency(&self, op: &Op) -> u32; + /// Read-after-write latency fn raw_latency( &self, write: &Op, dst_idx: usize, read: &Op, src_idx: usize, - ) -> u32 { - sched_common::raw_latency(self.sm(), write, dst_idx, read, src_idx) - } + ) -> u32; + /// Write-after-read latency fn war_latency( &self, read: &Op, src_idx: usize, write: &Op, dst_idx: usize, - ) -> u32 { - sched_common::war_latency(self.sm(), read, src_idx, write, dst_idx) - } + ) -> u32; + /// Write-after-write latency fn waw_latency( &self, a: &Op, a_dst_idx: usize, b: &Op, b_dst_idx: usize, - ) -> u32 { - sched_common::waw_latency(self.sm(), a, a_dst_idx, b, b_dst_idx) - } + ) -> u32; - fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32 { - sched_common::paw_latency(self.sm(), write, dst_idx) - } + /// Predicate read-after-write latency + fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32; - fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 { - sched_common::instr_latency(self.sm(), write, dst_idx) - } + /// Worst-case access-after-write latency + fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32; fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op); fn encode_shader(&self, s: &Shader<'_>) -> Vec; diff --git a/src/nouveau/compiler/nak/sched_common.rs b/src/nouveau/compiler/nak/sched_common.rs index 7b05c147b40..61116e2b718 100644 --- a/src/nouveau/compiler/nak/sched_common.rs +++ 
b/src/nouveau/compiler/nak/sched_common.rs @@ -5,100 +5,6 @@ use crate::ir::*; use std::ops::{Index, IndexMut, Range}; -pub fn instr_latency(sm: u8, op: &Op, dst_idx: usize) -> u32 { - let file = match op.dsts_as_slice()[dst_idx] { - Dst::None => return 0, - Dst::SSA(vec) => vec.file().unwrap(), - Dst::Reg(reg) => reg.file(), - }; - - let (gpr_latency, pred_latency) = if sm < 80 { - match op { - // Double-precision float ALU - Op::DAdd(_) - | Op::DFma(_) - | Op::DMnMx(_) - | Op::DMul(_) - | Op::DSetP(_) - // Half-precision float ALU - | Op::HAdd2(_) - | Op::HFma2(_) - | Op::HMul2(_) - | Op::HSet2(_) - | Op::HSetP2(_) - | Op::HMnMx2(_) => if sm == 70 { - // Volta is even slower - (13, 15) - } else { - (13, 14) - } - _ => (6, 13) - } - } else { - (6, 13) - }; - - // This is BS and we know it - match file { - RegFile::GPR => gpr_latency, - RegFile::UGPR => 12, - RegFile::Pred => pred_latency, - RegFile::UPred => 11, - RegFile::Bar => 0, // Barriers have a HW scoreboard - RegFile::Carry => 6, - RegFile::Mem => panic!("Not a register"), - } -} - -/// Read-after-write latency -pub fn raw_latency( - sm: u8, - write: &Op, - dst_idx: usize, - _read: &Op, - _src_idx: usize, -) -> u32 { - instr_latency(sm, write, dst_idx) -} - -/// Write-after-read latency -pub fn war_latency( - _sm: u8, - _read: &Op, - _src_idx: usize, - _write: &Op, - _dst_idx: usize, -) -> u32 { - // We assume the source gets read in the first 4 cycles. We don't know how - // quickly the write will happen. This is all a guess. 
- 4 -} - -/// Write-after-write latency -pub fn waw_latency( - sm: u8, - a: &Op, - a_dst_idx: usize, - _b: &Op, - _b_dst_idx: usize, -) -> u32 { - // We know our latencies are wrong so assume the wrote could happen anywhere - // between 0 and instr_latency(a) cycles - instr_latency(sm, a, a_dst_idx) -} - -/// Predicate read-after-write latency -pub fn paw_latency(sm: u8, write: &Op, _dst_idx: usize) -> u32 { - if sm == 70 { - match write { - Op::DSetP(_) | Op::HSetP2(_) => 15, - _ => 13, - } - } else { - 13 - } -} - pub struct RegTracker { reg: [T; 255], ureg: [T; 63], diff --git a/src/nouveau/compiler/nak/sm50.rs b/src/nouveau/compiler/nak/sm50.rs index 9f06bb7e97f..676278817cd 100644 --- a/src/nouveau/compiler/nak/sm50.rs +++ b/src/nouveau/compiler/nak/sm50.rs @@ -10,6 +10,43 @@ use bitview::*; use std::collections::HashMap; use std::ops::Range; +pub fn instr_latency(_sm: u8, op: &Op, dst_idx: usize) -> u32 { + let file = match op.dsts_as_slice()[dst_idx] { + Dst::None => return 0, + Dst::SSA(vec) => vec.file().unwrap(), + Dst::Reg(reg) => reg.file(), + }; + + let (gpr_latency, pred_latency) = match op { + // Double-precision float ALU + Op::DAdd(_) + | Op::DFma(_) + | Op::DMnMx(_) + | Op::DMul(_) + | Op::DSetP(_) + // Half-precision float ALU + | Op::HAdd2(_) + | Op::HFma2(_) + | Op::HMul2(_) + | Op::HSet2(_) + | Op::HSetP2(_) + | Op::HMnMx2(_) => { + (13, 14) + } + _ => (6, 13) + }; + + // This is BS and we know it + match file { + RegFile::GPR => gpr_latency, + RegFile::Pred => pred_latency, + RegFile::UGPR | RegFile::UPred => panic!("No uniform registers"), + RegFile::Bar => 0, // Barriers have a HW scoreboard + RegFile::Carry => 6, + RegFile::Mem => panic!("Not a register"), + } +} + pub struct ShaderModel50 { sm: u8, } @@ -75,6 +112,55 @@ impl ShaderModel for ShaderModel50 { } } + fn raw_latency( + &self, + write: &Op, + dst_idx: usize, + _read: &Op, + _src_idx: usize, + ) -> u32 { + instr_latency(self.sm, write, dst_idx) + } + + fn war_latency( + &self, + 
_read: &Op, + _src_idx: usize, + _write: &Op, + _dst_idx: usize, + ) -> u32 { + // We assume the source gets read in the first 4 cycles. We don't know + // how quickly the write will happen. This is all a guess. + 4 + } + + fn waw_latency( + &self, + a: &Op, + a_dst_idx: usize, + _b: &Op, + _b_dst_idx: usize, + ) -> u32 { + // We know our latencies are wrong so assume the write could happen + // anywhere between 0 and instr_latency(a) cycles + instr_latency(self.sm, a, a_dst_idx) + } + + fn paw_latency(&self, write: &Op, _dst_idx: usize) -> u32 { + if self.sm == 70 { + match write { + Op::DSetP(_) | Op::HSetP2(_) => 15, + _ => 13, + } + } else { + 13 + } + } + + fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 { + instr_latency(self.sm, write, dst_idx) + } + fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) { as_sm50_op_mut(op).legalize(b); } diff --git a/src/nouveau/compiler/nak/sm70.rs b/src/nouveau/compiler/nak/sm70.rs index 512dca2605d..58c9d2f8900 100644 --- a/src/nouveau/compiler/nak/sm70.rs +++ b/src/nouveau/compiler/nak/sm70.rs @@ -18,6 +18,51 @@ impl ShaderModel70 { fn has_uniform_alu(&self) -> bool { self.sm >= 73 } + + fn instr_latency(&self, op: &Op, dst_idx: usize) -> u32 { + let file = match op.dsts_as_slice()[dst_idx] { + Dst::None => return 0, + Dst::SSA(vec) => vec.file().unwrap(), + Dst::Reg(reg) => reg.file(), + }; + + let (gpr_latency, pred_latency) = if self.sm < 80 { + match op { + // Double-precision float ALU + Op::DAdd(_) + | Op::DFma(_) + | Op::DMnMx(_) + | Op::DMul(_) + | Op::DSetP(_) + // Half-precision float ALU + | Op::HAdd2(_) + | Op::HFma2(_) + | Op::HMul2(_) + | Op::HSet2(_) + | Op::HSetP2(_) + | Op::HMnMx2(_) => if self.sm == 70 { + // Volta is even slower + (13, 15) + } else { + (13, 14) + } + _ => (6, 13) + } + } else { + (6, 13) + }; + + // This is BS and we know it + match file { + RegFile::GPR => gpr_latency, + RegFile::UGPR => 12, + RegFile::Pred => pred_latency, + RegFile::UPred => 11, + RegFile::Bar => 
0, // Barriers have a HW scoreboard + RegFile::Carry => 6, + RegFile::Mem => panic!("Not a register"), + } + } } impl ShaderModel for ShaderModel70 { @@ -117,6 +162,55 @@ impl ShaderModel for ShaderModel70 { } } + fn raw_latency( + &self, + write: &Op, + dst_idx: usize, + _read: &Op, + _src_idx: usize, + ) -> u32 { + self.instr_latency(write, dst_idx) + } + + fn war_latency( + &self, + _read: &Op, + _src_idx: usize, + _write: &Op, + _dst_idx: usize, + ) -> u32 { + // We assume the source gets read in the first 4 cycles. We don't know + // how quickly the write will happen. This is all a guess. + 4 + } + + fn waw_latency( + &self, + a: &Op, + a_dst_idx: usize, + _b: &Op, + _b_dst_idx: usize, + ) -> u32 { + // We know our latencies are wrong so assume the write could happen + // anywhere between 0 and instr_latency(a) cycles + self.instr_latency(a, a_dst_idx) + } + + fn paw_latency(&self, write: &Op, _dst_idx: usize) -> u32 { + if self.sm == 70 { + match write { + Op::DSetP(_) | Op::HSetP2(_) => 15, + _ => 13, + } + } else { + 13 + } + } + + fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 { + self.instr_latency(write, dst_idx) + } + fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) { legalize_sm70_op(self, b, op); }