diff --git a/src/nouveau/compiler/nak/calc_instr_deps.rs b/src/nouveau/compiler/nak/calc_instr_deps.rs index 44af86ef5e0..deaeadf459c 100644 --- a/src/nouveau/compiler/nak/calc_instr_deps.rs +++ b/src/nouveau/compiler/nak/calc_instr_deps.rs @@ -17,18 +17,6 @@ struct RegTracker { carry: [T; 1], } -impl RegTracker { - pub fn new(v: T) -> Self { - Self { - reg: [v; 255], - ureg: [v; 63], - pred: [v; 7], - upred: [v; 7], - carry: [v; 1], - } - } -} - fn new_array_with(f: &impl Fn() -> T) -> [T; N] { let mut v = Vec::new(); for _ in 0..N { @@ -64,12 +52,12 @@ impl RegTracker { pub fn for_each_instr_src_mut( &mut self, instr: &Instr, - mut f: impl FnMut(&mut T), + mut f: impl FnMut(usize, &mut T), ) { - for src in instr.srcs() { + for (i, src) in instr.srcs().iter().enumerate() { if let SrcRef::Reg(reg) = &src.src_ref { - for i in &mut self[*reg] { - f(i); + for t in &mut self[*reg] { + f(i, t); } } } @@ -78,12 +66,12 @@ impl RegTracker { pub fn for_each_instr_dst_mut( &mut self, instr: &Instr, - mut f: impl FnMut(&mut T), + mut f: impl FnMut(usize, &mut T), ) { - for dst in instr.dsts() { + for (i, dst) in instr.dsts().iter().enumerate() { if let Dst::Reg(reg) = dst { - for i in &mut self[*reg] { - f(i); + for t in &mut self[*reg] { + f(i, t); } } } @@ -133,14 +121,14 @@ impl IndexMut for RegTracker { } #[derive(Clone)] -enum RegUse { +enum RegUse { None, - Write(usize), - Reads(Vec), + Write(T), + Reads(Vec), } -impl RegUse { - pub fn deps(&self) -> &[usize] { +impl RegUse { + pub fn deps(&self) -> &[T] { match self { RegUse::None => &[], RegUse::Write(dep) => slice::from_ref(dep), @@ -148,11 +136,11 @@ impl RegUse { } } - pub fn clear(&mut self) -> RegUse { + pub fn clear(&mut self) -> Self { std::mem::replace(self, RegUse::None) } - pub fn clear_write(&mut self) -> RegUse { + pub fn clear_write(&mut self) -> Self { if matches!(self, RegUse::Write(_)) { std::mem::replace(self, RegUse::None) } else { @@ -160,7 +148,7 @@ impl RegUse { } } - pub fn add_read(&mut self, dep: usize) -> RegUse { + pub fn add_read(&mut self, dep: T) -> Self { match self { RegUse::None => { *self = RegUse::Reads(vec![dep]); @@ -176,7 +164,7 @@ impl RegUse { } } - pub fn set_write(&mut self, dep: usize) -> RegUse { + pub fn set_write(&mut self, dep: T) -> Self { std::mem::replace(self, RegUse::Write(dep)) } } @@ -383,17 +371,17 @@ fn assign_barriers(f: &mut Function, sm: u8) { if instr.has_fixed_latency(sm) { // Delays will cover us here. We just need to make sure // that we wait on any uses that we consume. - uses.for_each_instr_src_mut(instr, |u| { + uses.for_each_instr_src_mut(instr, |_, u| { let u = u.clear_write(); waits.extend_from_slice(u.deps()); }); - uses.for_each_instr_dst_mut(instr, |u| { + uses.for_each_instr_dst_mut(instr, |_, u| { let u = u.clear(); waits.extend_from_slice(u.deps()); }); } else { let (rd, wr) = deps.add_instr(bi, ip); - uses.for_each_instr_src_mut(instr, |u| { + uses.for_each_instr_src_mut(instr, |_, u| { // Only mark a dep as signaled if we actually have // something that shows up in the register file as // needing scoreboarding @@ -401,7 +389,7 @@ fn assign_barriers(f: &mut Function, sm: u8) { let u = u.add_read(rd); waits.extend_from_slice(u.deps()); }); - uses.for_each_instr_dst_mut(instr, |u| { + uses.for_each_instr_dst_mut(instr, |_, u| { // Only mark a dep as signaled if we actually have // something that shows up in the register file as // needing scoreboarding @@ -464,30 +452,160 @@ fn assign_barriers(f: &mut Function, sm: u8) { } } +fn exec_latency(sm: u8, op: &Op) -> u32 { + match op { + Op::Bar(_) | Op::MemBar(_) => { + if sm >= 80 { + 6 + } else { + 5 + } + } + Op::CCtl(_op) => { + // CCTL.C needs 8, CCTL.I needs 11 + 11 + } + // Op::DepBar(_) => 4, + _ => 1, // TODO: co-issue + } +} + +fn instr_latency(op: &Op, dst_idx: usize) -> u32 { + let file = match op.dsts_as_slice()[dst_idx] { + Dst::None => return 0, + Dst::SSA(vec) => vec.file().unwrap(), + Dst::Reg(reg) => reg.file(), + }; + + // This is BS and we know it + if file.is_predicate() { + 13 + } else { + 6 + } +} + +/// Read-after-write latency +fn raw_latency( + _sm: u8, + write: &Op, + dst_idx: usize, + _read: &Op, + _src_idx: usize, +) -> u32 { + instr_latency(write, dst_idx) +} + +/// Write-after-read latency +fn war_latency( + _sm: u8, + _read: &Op, + _src_idx: usize, + _write: &Op, + _dst_idx: usize, +) -> u32 { + // We assume the source gets read in the first 4 cycles. We don't know how + // quickly the write will happen. This is all a guess. + 4 +} + +/// Write-after-write latency +fn waw_latency( + _sm: u8, + a: &Op, + a_dst_idx: usize, + _b: &Op, + _b_dst_idx: usize, +) -> u32 { + // We know our latencies are wrong so assume the wrote could happen anywhere + // between 0 and instr_latency(a) cycles + instr_latency(a, a_dst_idx) +} + +/// Predicate read-after-write latency +fn paw_latency(_sm: u8, _write: &Op, _dst_idx: usize) -> u32 { + 13 +} + fn calc_delays(f: &mut Function, sm: u8) { for b in f.blocks.iter_mut().rev() { let mut cycle = 0_u32; - let mut reads = RegTracker::new(0_u32); + + // Vector mapping IP to start cycle + let mut instr_cycle = Vec::new(); + instr_cycle.resize(b.instrs.len(), 0_u32); + + // Maps registers to RegUse. Predicates are + // represented by src_idx = usize::MAX. + let mut uses: RegTracker> = + RegTracker::new_with(&|| RegUse::None); + + // Map from barrier to last waited cycle let mut bars = [0_u32; 6]; - for instr in b.instrs.iter_mut().rev() { - // TODO: co-issue - let mut min_start = cycle + instr.get_exec_latency(sm); + + for ip in (0..b.instrs.len()).rev() { + let instr = &b.instrs[ip]; + let mut min_start = cycle + exec_latency(sm, &instr.op); if let Some(bar) = instr.deps.rd_bar() { min_start = max(min_start, bars[usize::from(bar)] + 2); } if let Some(bar) = instr.deps.wr_bar() { min_start = max(min_start, bars[usize::from(bar)] + 2); } - if instr.has_fixed_latency(sm) { - for (idx, dst) in instr.dsts().iter().enumerate() { - if let Dst::Reg(reg) = dst { - let latency = instr.get_dst_latency(sm, idx); - for c in &reads[*reg] { - min_start = max(min_start, *c + latency); - } + uses.for_each_instr_dst_mut(instr, |i, u| match u { + RegUse::None => { + // We don't know how it will be used but it may be used in + // the next block so we need at least assume the maximum + // destination latency from the end of the block. + let s = instr_latency(&instr.op, i); + min_start = max(min_start, s); + } + RegUse::Write((w_ip, w_dst_idx)) => { + let s = instr_cycle[*w_ip] + + waw_latency( + sm, + &instr.op, + i, + &b.instrs[*w_ip].op, + *w_dst_idx, + ); + min_start = max(min_start, s); + } + RegUse::Reads(reads) => { + for (r_ip, r_src_idx) in reads { + let c = instr_cycle[*r_ip]; + let s = if *r_src_idx == usize::MAX { + c + paw_latency(sm, &instr.op, i) + } else { + c + raw_latency( + sm, + &instr.op, + i, + &b.instrs[*r_ip].op, + *r_src_idx, + ) + }; + min_start = max(min_start, s); } } - } + }); + uses.for_each_instr_src_mut(instr, |i, u| match u { + RegUse::None => (), + RegUse::Write((w_ip, w_dst_idx)) => { + let s = instr_cycle[*w_ip] + + war_latency( + sm, + &instr.op, + i, + &b.instrs[*w_ip].op, + *w_dst_idx, + ); + min_start = max(min_start, s); + } + RegUse::Reads(_) => (), + }); + + let instr = &mut b.instrs[ip]; let delay = min_start - cycle; let delay = delay @@ -496,8 +614,16 @@ fn calc_delays(f: &mut Function, sm: u8) { .unwrap(); instr.deps.set_delay(delay); - reads.for_each_instr_pred_mut(instr, |c| *c = min_start); - reads.for_each_instr_src_mut(instr, |c| *c = min_start); + instr_cycle[ip] = min_start; + uses.for_each_instr_pred_mut(instr, |c| { + c.add_read((ip, usize::MAX)); + }); + uses.for_each_instr_src_mut(instr, |i, c| { + c.add_read((ip, i)); + }); + uses.for_each_instr_dst_mut(instr, |i, c| { + c.set_write((ip, i)); + }); for (bar, c) in bars.iter_mut().enumerate() { if instr.deps.wt_bar_mask & (1 << bar) != 0 { *c = min_start; @@ -516,7 +642,7 @@ fn calc_delays(f: &mut Function, sm: u8) { if matches!(instr.op, Op::SrcBar(_)) { instr.op = Op::Nop(OpNop { label: None }); MappedInstrs::One(instr) - } else if instr.get_exec_latency(sm) > 1 { + } else if exec_latency(sm, &instr.op) > 1 { let mut nop = Instr::new_boxed(OpNop { label: None }); nop.deps.set_delay(2); MappedInstrs::Many(vec![instr, nop]) diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index ef27e9ed5ac..5a07ca29656 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -5832,39 +5832,6 @@ impl Instr { } } - /// Minimum latency before another instruction can execute - pub fn get_exec_latency(&self, sm: u8) -> u32 { - match &self.op { - Op::Bar(_) | Op::MemBar(_) => { - if sm >= 80 { - 6 - } else { - 5 - } - } - Op::CCtl(_op) => { - // CCTL.C needs 8, CCTL.I needs 11 - 11 - } - // Op::DepBar(_) => 4, - _ => 1, // TODO: co-issue - } - } - - pub fn get_dst_latency(&self, sm: u8, dst_idx: usize) -> u32 { - debug_assert!(self.has_fixed_latency(sm)); - let file = match self.dsts()[dst_idx] { - Dst::None => return 0, - Dst::SSA(vec) => vec.file().unwrap(), - Dst::Reg(reg) => reg.file(), - }; - if file.is_predicate() { - 13 - } else { - 6 - } - } - pub fn needs_yield(&self) -> bool { matches!(&self.op, Op::Bar(_) | Op::BSync(_)) }