From 6b8a4e6bb73117e1141fe80e6d8fdfe5d2a39d33 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 17 Feb 2025 07:21:41 +1000 Subject: [PATCH] nak: Add Turing latency information This adds the latency information provided by NVIDIA. This is copied from excel spreadsheets provided to Red Hat. This fully passes CTS on Turing TU104 with no regressions. I'm sure future use of some instructions like IMAD may require some changes to this, but it should be functionally complete. Acked-by: Faith Ekstrand Part-of: --- src/nouveau/compiler/nak/ir.rs | 14 + src/nouveau/compiler/nak/lib.rs | 1 + src/nouveau/compiler/nak/sm70.rs | 69 +- .../compiler/nak/sm75_instr_latencies.rs | 1345 +++++++++++++++++ 4 files changed, 1410 insertions(+), 19 deletions(-) create mode 100644 src/nouveau/compiler/nak/sm75_instr_latencies.rs diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index cb21c8bb453..5722433f91d 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -801,6 +801,16 @@ impl SrcRef { } } + pub fn is_bindless_cbuf(&self) -> bool { + match self { + SrcRef::CBuf(cbuf) => match cbuf.buf { + CBuf::BindlessSSA(_) | CBuf::BindlessUGPR(_) => true, + _ => false, + }, + _ => false, + } + } + pub fn is_predicate(&self) -> bool { match self { SrcRef::Zero | SrcRef::Imm32(_) | SrcRef::CBuf(_) => false, @@ -1288,6 +1298,10 @@ impl Src { } } + pub fn is_bindless_cbuf(&self) -> bool { + self.src_ref.is_bindless_cbuf() + } + pub fn is_predicate(&self) -> bool { self.src_ref.is_predicate() } diff --git a/src/nouveau/compiler/nak/lib.rs b/src/nouveau/compiler/nak/lib.rs index 34781685bae..9fd8133dc58 100644 --- a/src/nouveau/compiler/nak/lib.rs +++ b/src/nouveau/compiler/nak/lib.rs @@ -29,6 +29,7 @@ mod repair_ssa; mod sm50; mod sm70; mod sm70_encode; +mod sm75_instr_latencies; mod sph; mod spill_values; mod to_cssa; diff --git a/src/nouveau/compiler/nak/sm70.rs b/src/nouveau/compiler/nak/sm70.rs index 3da28c657c5..f7691dbb831 100644 --- a/src/nouveau/compiler/nak/sm70.rs +++ b/src/nouveau/compiler/nak/sm70.rs @@ -4,6 +4,7 @@ use crate::ir::*; use crate::legalize::LegalizeBuilder; use crate::sm70_encode::*; +use crate::sm75_instr_latencies::SM75Latency; pub struct ShaderModel70 { sm: u8, @@ -144,6 +145,18 @@ impl ShaderModel for ShaderModel70 { } } + fn op_needs_scoreboard(&self, op: &Op) -> bool { + if op.no_scoreboard() { + return false; + } + + if self.is_turing() { + SM75Latency::needs_scoreboards(op) + } else { + !op.has_fixed_latency(self.sm()) + } + } + fn exec_latency(&self, op: &Op) -> u32 { match op { Op::Bar(_) | Op::MemBar(_) => { @@ -166,39 +179,53 @@ impl ShaderModel for ShaderModel70 { &self, write: &Op, dst_idx: usize, - _read: &Op, - _src_idx: usize, + read: &Op, + src_idx: usize, ) -> u32 { - self.instr_latency(write, dst_idx) + if self.is_turing() { + SM75Latency::raw(write, dst_idx, Some(read), src_idx) + } else { + self.instr_latency(write, dst_idx) + } } fn war_latency( &self, - _read: &Op, - _src_idx: usize, - _write: &Op, - _dst_idx: usize, + read: &Op, + src_idx: usize, + write: &Op, + dst_idx: usize, ) -> u32 { - // We assume the source gets read in the first 4 cycles. We don't know - // how quickly the write will happen. This is all a guess. - 4 + if self.is_turing() { + SM75Latency::war(read, src_idx, write, dst_idx) + } else { + // We assume the source gets read in the first 4 cycles. We don't + // know how quickly the write will happen. This is all a guess. + 4 + } } fn waw_latency( &self, a: &Op, a_dst_idx: usize, - _a_has_pred: bool, - _b: &Op, - _b_dst_idx: usize, + a_has_pred: bool, + b: &Op, + b_dst_idx: usize, ) -> u32 { - // We know our latencies are wrong so assume the wrote could happen - // anywhere between 0 and instr_latency(a) cycles - self.instr_latency(a, a_dst_idx) + if self.is_turing() { + SM75Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred) + } else { + // We know our latencies are wrong so assume the wrote could happen + // anywhere between 0 and instr_latency(a) cycles + self.instr_latency(a, a_dst_idx) + } } - fn paw_latency(&self, write: &Op, _dst_idx: usize) -> u32 { - if self.is_volta() { + fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32 { + if self.is_turing() { + SM75Latency::raw(write, dst_idx, None, 0) + } else if self.is_volta() { match write { Op::DSetP(_) | Op::HSetP2(_) => 15, _ => 13, @@ -209,7 +236,11 @@ impl ShaderModel for ShaderModel70 { } fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 { - self.instr_latency(write, dst_idx) + if self.is_turing() { + SM75Latency::raw(write, dst_idx, None, 0) + } else { + self.instr_latency(write, dst_idx) + } } fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) { diff --git a/src/nouveau/compiler/nak/sm75_instr_latencies.rs b/src/nouveau/compiler/nak/sm75_instr_latencies.rs new file mode 100644 index 00000000000..1a1d422f56c --- /dev/null +++ b/src/nouveau/compiler/nak/sm75_instr_latencies.rs @@ -0,0 +1,1345 @@ +// Copyright © 2025 Red Hat. +// SPDX-License-Identifier: MIT +#![allow(non_camel_case_types)] + +use crate::ir::*; + +// This contains the register scheduling information provided by NVIDIA. This +// file is for Turing only. + +// Coupled instructions are ones with fixed latencies, they need delays but not +// scoreboards. Decoupled instructions are ones with variable latencies, need +// scoreboards but not delays. There are also redirected instructions which +// depending on the SM, can be coupled or decoupled so both delays and +// scoreboards needs to be provided. + +#[allow(dead_code)] +#[derive(Debug)] +enum RegLatencySM75 { + CoupledDisp64, + CoupledDisp, + CoupledAlu, + CoupledFMA, + IMADLo, + IMADWideAB, // readers only + IMADWideLower, + IMADWideUpper, + RedirectedFP64, + RedirectedFP16, + RedirectedHMMA_884_F16, + RedirectedHMMA_884_F32, + RedirectedHMMA_1688, + RedirectedHMMA_16816, + IMMA, + Decoupled, + DecoupledOther, //reads only + BMov, + GuardPredicate, +} + +pub const fn pred(has_pred: bool, a: u32, b: u32) -> u32 { + if has_pred { + a + b + } else { + b + } +} + +impl RegLatencySM75 { + fn op_category(op: &Op, reader: bool, op_reg_idx: usize) -> RegLatencySM75 { + use RegLatencySM75::*; + match op { + // this will need updating if imad grows support for input predicates + Op::IMad(_) | Op::IMul(_) => IMADLo, + Op::IMad64(_) => { + if reader { + match op_reg_idx { + 0 | 1 => IMADWideAB, + 2 => IMADWideLower, // vs upper C operand - work it out + _ => { + panic!("Illegal field in imadwide") + } + } + } else { + IMADWideUpper // as above this needs more work + } + } + + Op::PopC(_) => Decoupled, + Op::IAdd3(_) | Op::IAdd3X(_) => CoupledAlu, + + Op::BMsk(_) => CoupledAlu, + // Sgxt => CoupledAlu, + Op::Lop3(_) => CoupledAlu, + Op::Flo(_) => Decoupled, + Op::ISetP(_) => CoupledAlu, + Op::IAbs(_) => CoupledAlu, + Op::Lea(_) => CoupledAlu, + Op::LeaX(_) => CoupledAlu, + Op::IMnMx(_) => CoupledAlu, + Op::I2I(_) => CoupledAlu, + // I2IP => CoupledAlu + Op::Shf(_) => CoupledAlu, + + Op::FFma(_) => CoupledFMA, + Op::FAdd(_) => CoupledFMA, + Op::FMul(_) => CoupledFMA, + Op::FMnMx(_) => CoupledAlu, + Op::FSwzAdd(_) => CoupledFMA, + Op::FSet(_) => CoupledAlu, + // FSel => CoupledAlu, + Op::FSetP(_) => CoupledAlu, + // FChk => Decoupled, + Op::DAdd(_) | Op::DFma(_) | Op::DMul(_) | Op::DSetP(_) => { + RedirectedFP64 + } + + Op::DMnMx(_) => RedirectedFP64, // not in docs + + Op::HAdd2(_) + | Op::HFma2(_) + | Op::HMul2(_) + | Op::HSet2(_) + | Op::HSetP2(_) => RedirectedFP16, + + Op::HMnMx2(_) => RedirectedFP16, // not in docs + // let in for documentation purposes + // Op::Hmma(h) => { + // match h.mat_size { + // HmmaSize::M16N8K4 => match h.dst_type { + // FloatType::F16 => RedirectedHMMA_884_F16, + // _ => RedirectedHMMA_884_F32 + // } + // HmmaSize::M16N8K8 => RedirectedHMMA_1688, + // HmmaSize::M16N8K16 => RedirectedHMMA_16816, + // } + // } + Op::Ipa(_) => Decoupled, + Op::MuFu(_) => Decoupled, + + // Conversion functions all decoupled + Op::F2F(_) => Decoupled, + Op::F2I(_) => Decoupled, + Op::I2F(_) => Decoupled, + Op::FRnd(_) => Decoupled, + Op::AL2P(_) => Decoupled, + + Op::Mov(_) => CoupledAlu, + Op::Sel(_) => CoupledAlu, + Op::BRev(_) => Decoupled, + // P2R => CoupledAlu, + // R2P => CoupledAlu, + Op::PLop3(_) => CoupledAlu, + Op::Prmt(_) => CoupledAlu, + Op::Nop(_) => CoupledDisp, + Op::Vote(_) => CoupledDisp, + Op::S2R(_) => Decoupled, + // S2UR => Decoupled, + Op::R2UR(_) => { + if reader { + Decoupled + } else { + panic!("Illegal R2UR"); + } + } + Op::CS2R(cs2r) => { + if cs2r.dst.as_reg().unwrap().comps() == 2 { + CoupledDisp64 + } else { + CoupledAlu + } + } + // B2R => Decoupled, + // LEPC => CoupledDisp64 + Op::BMov(bmov) => match bmov.dst { + Dst::Reg(reg) => { + if reg.is_gpr() { + BMov + } else { + Decoupled + } + } + _ => Decoupled, + }, + // RPCMOV.32 => CoupledAlu, + // RPCMOV.64 => CoupledDisp64 + // PMTRIG => CoupledDisp64 + // CSMTEST => CoupledAlu, + Op::Bar(_) => Decoupled, + // Remove when Imma added + //Op::Imma(_) => IMMA, + Op::IDp4(_) => CoupledFMA, + Op::BClear(_) => Decoupled, + Op::Bra(_) => Decoupled, + Op::BSSy(_) => Decoupled, + Op::Kill(_) => Decoupled, + Op::Exit(_) => Decoupled, + Op::BSync(_) => Decoupled, + Op::Tex(_) => Decoupled, + Op::Tld(_) => Decoupled, + Op::Tld4(_) => Decoupled, + Op::Tmml(_) => Decoupled, + Op::Txd(_) => Decoupled, + Op::Txq(_) => Decoupled, + Op::Ldc(_) => Decoupled, + Op::ALd(_) => Decoupled, + Op::ASt(_) => Decoupled, + Op::Out(_) => Decoupled, + Op::OutFinal(_) => Decoupled, + Op::Ld(_) => Decoupled, + Op::St(_) => Decoupled, + Op::Atom(_) => Decoupled, + //CCtl.i,c are coupled + Op::CCtl(_) => DecoupledOther, + Op::MemBar(_) => Decoupled, + Op::SuLd(_) => Decoupled, + Op::SuSt(_) => Decoupled, + Op::SuAtom(_) => Decoupled, + Op::PixLd(_) => Decoupled, + Op::Isberd(_) => Decoupled, + Op::LdTram(_) => Decoupled, + Op::Shfl(_) => Decoupled, + //Op::LdSm(_) => Decoupled + x => { + panic!("Illegal instuction in reg category {}", x); + } + } + } + + pub fn read_after_write( + writer: RegLatencySM75, + reader: RegLatencySM75, + ) -> u32 { + use RegLatencySM75::*; + match writer { + IMADWideAB | DecoupledOther => { + panic!("Illegal IMADWideAB for writer"); + } + _ => {} + } + + match reader { + CoupledDisp64 | CoupledDisp | CoupledAlu => match writer { + CoupledDisp64 => 6, + CoupledAlu | CoupledDisp => 4, + CoupledFMA | IMADLo => 5, + IMADWideLower => 3, + IMADWideUpper => 5, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + RedirectedHMMA_884_F16 => 13, + RedirectedHMMA_884_F32 => 10, + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, + _ => 1, + }, + CoupledFMA | IMADLo => match writer { + CoupledDisp64 => 6, + CoupledAlu | CoupledDisp => 5, + CoupledFMA | IMADLo => 4, + IMADWideLower => 2, + IMADWideUpper => 4, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + RedirectedHMMA_884_F16 => 13, + RedirectedHMMA_884_F32 => 10, + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, + _ => 1, + }, + IMADWideAB => match writer { + CoupledDisp64 => 6, + CoupledAlu | CoupledDisp => 5, + CoupledFMA | IMADLo => 4, + IMADWideLower => 4, + IMADWideUpper => 6, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + RedirectedHMMA_884_F16 => 13, + RedirectedHMMA_884_F32 => 10, + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, + _ => 1, + }, + IMADWideLower | IMADWideUpper => match reader { + IMADWideLower => match writer { + CoupledDisp64 => 6, + CoupledAlu | CoupledDisp => 5, + CoupledFMA | IMADLo => 4, + IMADWideLower => 2, + IMADWideUpper => 2, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + RedirectedHMMA_884_F16 => 13, + RedirectedHMMA_884_F32 => 10, + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, + _ => 1, + }, + IMADWideUpper => match writer { + CoupledDisp64 => 4, + CoupledAlu | CoupledDisp => 3, + CoupledFMA | IMADLo => 2, + IMADWideLower => 2, + IMADWideUpper => 2, + RedirectedFP64 => 7, + RedirectedFP16 => 6, + RedirectedHMMA_884_F16 => 11, + RedirectedHMMA_884_F32 => 8, + RedirectedHMMA_1688 => 12, + RedirectedHMMA_16816 => 20, + IMMA => 8, + _ => 1, + }, + _ => { + panic!("Illegal IMAD field"); + } + }, + RedirectedFP64 => match writer { + CoupledDisp64 => 6, + CoupledAlu | CoupledDisp => 6, + CoupledFMA | IMADLo => 6, + IMADWideLower => 6, + IMADWideUpper => 6, + RedirectedFP64 => 8, + RedirectedFP16 => 8, + RedirectedHMMA_884_F16 => 13, + RedirectedHMMA_884_F32 => 10, + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, + _ => 1, + }, + RedirectedFP16 => match writer { + CoupledDisp64 => 6, + CoupledAlu | CoupledDisp => 6, + CoupledFMA | IMADLo => 6, + IMADWideLower => 6, + IMADWideUpper => 6, + RedirectedFP64 => 9, + RedirectedFP16 => 6, + RedirectedHMMA_884_F16 => 13, + RedirectedHMMA_884_F32 => 10, + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, + _ => 1, + }, + RedirectedHMMA_884_F16 + | RedirectedHMMA_884_F32 + | RedirectedHMMA_1688 + | RedirectedHMMA_16816 + | Decoupled => { + match writer { + CoupledDisp64 => 6, + CoupledAlu | CoupledDisp => 6, + CoupledFMA | IMADLo => 6, + IMADWideLower => 6, + IMADWideUpper => 6, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + RedirectedHMMA_884_F16 => 13, //4 for back to back FMA for 884 + RedirectedHMMA_884_F32 => 10, //4 for back o back FMA for 884 + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, + _ => 1, + } + } + IMMA | DecoupledOther => { + match writer { + CoupledDisp64 => 8, + CoupledAlu | CoupledDisp => 8, + CoupledFMA | IMADLo => 8, + IMADWideLower => 8, + IMADWideUpper => 8, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + RedirectedHMMA_884_F16 => 13, + RedirectedHMMA_884_F32 => 10, + RedirectedHMMA_1688 => 14, + RedirectedHMMA_16816 => 22, + IMMA => 10, // 4 for back to back IMMA + _ => 1, + } + } + BMov | GuardPredicate => { + panic!("Not a RAW category") + } + } + } + + fn write_after_write( + writer1: RegLatencySM75, + writer2: RegLatencySM75, + has_pred: bool, + ) -> u32 { + use RegLatencySM75::*; + match writer1 { + IMADWideAB | DecoupledOther => { + panic!("Illegal reg latency for writer"); + } + _ => {} + } + match writer2 { + CoupledDisp64 => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper => 1, + RedirectedFP64 => 4, + RedirectedFP16 => 3, + RedirectedHMMA_884_F16 => 8, + RedirectedHMMA_884_F32 => pred(has_pred, 2, 2), + RedirectedHMMA_1688 => 9, + RedirectedHMMA_16816 => 17, + IMMA => 5, + _ => 1, + }, + CoupledDisp | CoupledAlu => match writer1 { + CoupledDisp64 => 2, + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideLower | IMADWideUpper => 1, + RedirectedFP64 => pred(has_pred, 4, 1), + RedirectedFP16 => pred(has_pred, 3, 1), + RedirectedHMMA_884_F16 => pred(has_pred, 8, 1), + RedirectedHMMA_884_F32 => pred(has_pred, 5, 1), + RedirectedHMMA_1688 => pred(has_pred, 9, 1), + RedirectedHMMA_16816 => pred(has_pred, 17, 1), + IMMA => pred(has_pred, 5, 1), + _ => 1, + }, + CoupledFMA | IMADLo => match writer1 { + CoupledDisp64 => 2, + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideLower => 1, + IMADWideUpper => pred(has_pred, 1, 1), + RedirectedFP64 => pred(has_pred, 4, 1), + RedirectedFP16 => pred(has_pred, 3, 1), + RedirectedHMMA_884_F16 => pred(has_pred, 8, 1), + RedirectedHMMA_884_F32 => pred(has_pred, 5, 1), + RedirectedHMMA_1688 => pred(has_pred, 9, 1), + RedirectedHMMA_16816 => pred(has_pred, 17, 1), + IMMA => pred(has_pred, 5, 1), + _ => 1, + }, + IMADWideLower => match writer1 { + CoupledDisp64 => pred(has_pred, 2, 2), + CoupledDisp | CoupledAlu => pred(has_pred, 2, 1), + CoupledFMA | IMADLo => pred(has_pred, 1, 1), + IMADWideLower => 1, + IMADWideUpper => 1, + RedirectedFP64 => pred(has_pred, 4, 3), + RedirectedFP16 => pred(has_pred, 3, 3), + RedirectedHMMA_884_F16 => pred(has_pred, 8, 3), + RedirectedHMMA_884_F32 => pred(has_pred, 5, 3), + RedirectedHMMA_1688 => pred(has_pred, 9, 3), + RedirectedHMMA_16816 => pred(has_pred, 17, 3), + IMMA => pred(has_pred, 5, 3), + _ => 1, + }, + IMADWideUpper => match writer1 { + CoupledDisp64 => pred(has_pred, 1, 1), + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideLower | IMADWideUpper => 1, + RedirectedFP64 => pred(has_pred, 4, 1), + RedirectedFP16 => pred(has_pred, 3, 1), + RedirectedHMMA_884_F16 => pred(has_pred, 8, 1), + RedirectedHMMA_884_F32 => pred(has_pred, 5, 1), + RedirectedHMMA_1688 => pred(has_pred, 9, 1), + RedirectedHMMA_16816 => pred(has_pred, 17, 1), + IMMA => pred(has_pred, 5, 1), + _ => 1, + }, + RedirectedFP64 => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper => 2, + RedirectedFP64 => 1, + RedirectedFP16 => 2, + RedirectedHMMA_884_F16 => 5, + RedirectedHMMA_884_F32 => 2, + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 14, + IMMA => 2, + _ => 1, + }, + RedirectedFP16 => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper => 2, + RedirectedFP64 => pred(has_pred, 1, 1), + RedirectedFP16 => 1, + RedirectedHMMA_884_F16 => pred(has_pred, 6, 1), + RedirectedHMMA_884_F32 => pred(has_pred, 3, 1), + RedirectedHMMA_1688 => pred(has_pred, 7, 1), + RedirectedHMMA_16816 => pred(has_pred, 15, 1), + IMMA => pred(has_pred, 3, 1), + _ => 1, + }, + RedirectedHMMA_884_F16 => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper => 2, + RedirectedFP64 => pred(has_pred, 3, 2), + RedirectedFP16 => pred(has_pred, 2, 2), + RedirectedHMMA_884_F16 => 1, + RedirectedHMMA_884_F32 => pred(has_pred, 2, 4), + RedirectedHMMA_1688 => pred(has_pred, 6, 4), + RedirectedHMMA_16816 => pred(has_pred, 16, 2), + IMMA => pred(has_pred, 2, 4), + _ => 1, + }, + RedirectedHMMA_884_F32 => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper => 2, + RedirectedFP64 => pred(has_pred, 3, 2), + RedirectedFP16 => pred(has_pred, 2, 2), + RedirectedHMMA_884_F16 => pred(has_pred, 4, 5), + RedirectedHMMA_884_F32 => 1, + RedirectedHMMA_1688 => pred(has_pred, 6, 4), + RedirectedHMMA_16816 => pred(has_pred, 16, 2), + IMMA => pred(has_pred, 2, 4), + _ => 1, + }, + RedirectedHMMA_1688 => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper | RedirectedFP64 + | RedirectedFP16 => 2, + RedirectedHMMA_884_F16 => 4, + RedirectedHMMA_884_F32 => 2, + RedirectedHMMA_1688 => 1, + RedirectedHMMA_16816 => 16, + IMMA => 2, + _ => 1, + }, + RedirectedHMMA_16816 => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper | RedirectedFP64 + | RedirectedFP16 => 2, + RedirectedHMMA_884_F16 => 4, + RedirectedHMMA_884_F32 => 2, + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 1, + IMMA => 2, + _ => 1, + }, + IMMA => match writer1 { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA + | IMADLo | IMADWideLower | IMADWideUpper => { + pred(has_pred, 2, 2) + } + RedirectedFP64 => pred(has_pred, 2, 3), + RedirectedFP16 => pred(has_pred, 2, 2), + RedirectedHMMA_884_F16 => pred(has_pred, 2, 7), + RedirectedHMMA_884_F32 => pred(has_pred, 2, 4), + RedirectedHMMA_1688 => pred(has_pred, 6, 4), + RedirectedHMMA_16816 => pred(has_pred, 14, 4), + IMMA => 1, + _ => 1, + }, + Decoupled => match writer1 { + CoupledDisp64 + | CoupledDisp + | CoupledAlu + | CoupledFMA + | IMADLo + | IMADWideLower + | IMADWideUpper + | RedirectedFP64 + | RedirectedFP16 + | RedirectedHMMA_884_F16 + | RedirectedHMMA_884_F32 + | RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 14, + IMMA => 2, + _ => 1, + }, + BMov => { + // BMOV Writing to RF? + match writer1 { + CoupledDisp64 + | CoupledDisp + | CoupledAlu + | CoupledFMA + | IMADLo + | IMADWideLower + | IMADWideUpper + | RedirectedFP64 + | RedirectedFP16 + | RedirectedHMMA_884_F16 + | RedirectedHMMA_884_F32 + | RedirectedHMMA_1688 => 9, + RedirectedHMMA_16816 => 14, + IMMA => 9, + _ => 1, + } + } + IMADWideAB | DecoupledOther | GuardPredicate => { + panic!("Not a WAW category") + } + } + } + + fn write_after_read(reader: RegLatencySM75, writer: RegLatencySM75) -> u32 { + use RegLatencySM75::*; + match writer { + CoupledDisp64 | CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideLower | IMADWideUpper => match reader { + RedirectedHMMA_1688 => 5, + RedirectedHMMA_16816 => 13, + _ => 1, + }, + RedirectedFP64 => match reader { + RedirectedFP64 => 1, + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 14, + Decoupled => 1, + _ => 2, + }, + RedirectedFP16 => match reader { + RedirectedFP16 => 1, + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 14, + Decoupled => 1, + _ => 2, + }, + RedirectedHMMA_884_F16 => match reader { + RedirectedHMMA_884_F16 => 1, + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 14, + Decoupled => 1, + _ => 2, + }, + RedirectedHMMA_884_F32 => match reader { + RedirectedHMMA_884_F32 => 1, + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 14, + Decoupled => 1, + _ => 2, + }, + RedirectedHMMA_1688 => match reader { + RedirectedHMMA_1688 => 1, + RedirectedHMMA_16816 => 14, + Decoupled => 1, + _ => 2, + }, + RedirectedHMMA_16816 => match reader { + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 1, + Decoupled => 1, + _ => 2, + }, + IMMA => match reader { + RedirectedHMMA_1688 => 6, + RedirectedHMMA_16816 => 14, + IMMA => 1, + Decoupled => 1, + _ => 2, + }, + Decoupled => match reader { + RedirectedHMMA_1688 => 2, + RedirectedHMMA_16816 => 14, + Decoupled => 1, + _ => 2, + }, + BMov => match reader { + RedirectedHMMA_1688 => 9, + RedirectedHMMA_16816 => 14, + Decoupled => 1, + _ => 9, + }, + IMADWideAB | DecoupledOther | GuardPredicate => { + panic!("Illegal in WAR"); + } + } + } + + fn pred_read_after_write( + writer: RegLatencySM75, + reader: RegLatencySM75, + ) -> u32 { + use RegLatencySM75::*; + match reader { + CoupledDisp => match writer { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower => 12, + RedirectedFP64 => 15, + RedirectedFP16 => 14, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + CoupledAlu => match writer { + CoupledDisp | CoupledAlu => 4, + CoupledFMA | IMADLo | IMADWideUpper | IMADWideLower => 5, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + CoupledFMA | IMADLo => match writer { + CoupledDisp | CoupledAlu => 5, + CoupledFMA | IMADLo | IMADWideUpper | IMADWideLower => 4, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + IMADWideUpper | IMADWideLower => match writer { + CoupledDisp | CoupledAlu => 5, + CoupledFMA | IMADLo => 4, + IMADWideUpper | IMADWideLower => 2, + RedirectedFP64 => 9, + RedirectedFP16 => 8, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + RedirectedFP64 => match writer { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower => 12, + RedirectedFP64 => 8, + RedirectedFP16 => 14, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + RedirectedFP16 => match writer { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower => 12, + RedirectedFP64 => 15, + RedirectedFP16 => 6, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + Decoupled | GuardPredicate => match writer { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower => 12, + RedirectedFP64 => 15, + RedirectedFP16 => 14, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + _ => { + panic!("Illegal reader in reg predicate"); + } + } + } + + fn pred_write_after_write( + writer1: RegLatencySM75, + writer2: RegLatencySM75, + has_pred: bool, + ) -> u32 { + use RegLatencySM75::*; + match writer2 { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo => match writer1 { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower => 1, + RedirectedFP64 => pred(has_pred, 4, 1), + RedirectedFP16 => pred(has_pred, 3, 1), + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + IMADWideUpper | IMADWideLower => match writer1 { + CoupledDisp | CoupledAlu => pred(has_pred, 1, 2), + CoupledFMA | IMADLo => pred(has_pred, 1, 1), + IMADWideUpper | IMADWideLower => 1, + RedirectedFP64 => pred(has_pred, 4, 3), + RedirectedFP16 => pred(has_pred, 3, 3), + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + RedirectedFP64 => match writer1 { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower => pred(has_pred, 2, 2), + RedirectedFP64 => 1, + RedirectedFP16 => pred(has_pred, 2, 4), + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + RedirectedFP16 => match writer1 { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower => pred(has_pred, 2, 4), + RedirectedFP64 => pred(has_pred, 2, 7), + RedirectedFP16 => 1, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + Decoupled => match writer1 { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo + | IMADWideUpper | IMADWideLower | RedirectedFP64 + | RedirectedFP16 => 2, + Decoupled => 1, + _ => { + panic!("Illegal RAW in Predicate"); + } + }, + _ => { + panic!("Illegal WAR category in Predicates"); + } + } + } + + fn pred_write_after_read( + reader: RegLatencySM75, + writer: RegLatencySM75, + ) -> u32 { + use RegLatencySM75::*; + match writer { + CoupledDisp | CoupledAlu | CoupledFMA | IMADLo | IMADWideUpper + | IMADWideLower => 1, + RedirectedFP64 => match reader { + CoupledAlu | CoupledFMA | IMADLo | IMADWideUpper + | IMADWideLower | RedirectedFP16 => 2, + _ => 1, + }, + RedirectedFP16 => match reader { + CoupledAlu | CoupledFMA | IMADLo | IMADWideUpper + | IMADWideLower | RedirectedFP64 => 2, + _ => 1, + }, + Decoupled => match reader { + CoupledAlu | CoupledFMA | IMADLo | IMADWideUpper + | IMADWideLower | RedirectedFP16 | RedirectedFP64 => 2, + _ => 1, + }, + _ => { + panic!("Illegal WAR category in Predicates"); + } + } + } +} + +#[allow(non_camel_case_types)] +#[allow(dead_code)] +#[derive(Debug)] +enum URegLatencySM75 { + Udp, + VectorCoupled, + VectorDecoupled, + Uldc, + Umov, + VectorCoupledBindless, + VectorDecoupledBindless, + VoteU, + GuardPredicate, + R2UR, +} + +impl URegLatencySM75 { + fn op_category( + op: &Op, + reader: bool, + op_reg_idx: usize, + ) -> URegLatencySM75 { + use URegLatencySM75::*; + // is this using a bindless cbuf as a src register. + // this decides between the category types for readers. + let bindless = + reader && op.srcs_as_slice()[op_reg_idx].is_bindless_cbuf(); + + let vcoupled = if bindless { + VectorCoupledBindless + } else { + VectorCoupled + }; + let vdecoupled = if bindless { + VectorDecoupledBindless + } else { + VectorDecoupled + }; + + // if this is a reader from a ureg, it could be a U* instruction or a regular instruction. + let uniform_op = op.is_uniform(); + + let vcoupled = if uniform_op { Udp } else { vcoupled }; + let vdecoupled = if uniform_op { Udp } else { vdecoupled }; + + match op { + Op::BMsk(_) => vcoupled, + Op::BRev(_) => vcoupled, + // uclea? + Op::Flo(_) => vdecoupled, + Op::IAdd3(_) | Op::IAdd3X(_) => vcoupled, + Op::IAbs(_) => vcoupled, + Op::IMnMx(_) => vcoupled, + Op::IMad(_) => vcoupled, + + Op::IMad64(_) => vcoupled, + Op::ISetP(_) => vcoupled, + Op::Ldc(_) => { + if uniform_op { + Uldc + } else { + vdecoupled + } + } + Op::Lea(_) => vcoupled, + Op::LeaX(_) => vcoupled, + Op::Lop2(_) | Op::Lop3(_) => vcoupled, + + Op::MuFu(_) => vdecoupled, + Op::Mov(_) => { + if uniform_op { + Umov + } else { + vcoupled + } + } + + // mov32i => URegLatency::Uldc, + // p2ur => Udp, + Op::PLop3(_) => vcoupled, + Op::PopC(_) => vdecoupled, + Op::Prmt(_) => vcoupled, + Op::PSetP(_) => vcoupled, + // UR2UP + Op::Sel(_) => vcoupled, + // SGXT + Op::Shf(_) => vcoupled, + Op::Shfl(_) => vdecoupled, + + Op::I2F(_) => vdecoupled, + Op::F2I(_) => vdecoupled, + Op::F2F(_) => vdecoupled, + Op::R2UR(_) => { + if !reader { + R2UR + } else { + panic!("Illegal R2UR in ureg"); + } + } + Op::Vote(_) => VoteU, + + Op::FRnd(_) => vdecoupled, + Op::FAdd(_) + | Op::FMul(_) + | Op::FFma(_) + | Op::FSet(_) + | Op::FSetP(_) + | Op::FMnMx(_) + | Op::HAdd2(_) + | Op::HMul2(_) + | Op::HSet2(_) + | Op::HFma2(_) + | Op::HSetP2(_) => vcoupled, + Op::DMul(_) | Op::DFma(_) | Op::DAdd(_) | Op::DSetP(_) => { + vdecoupled + } + _ => { + panic!("Illegal instuction in ureg category {}", op); + } + } + } + + fn read_after_write( + writer: URegLatencySM75, + reader: URegLatencySM75, + ) -> u32 { + use URegLatencySM75::*; + match reader { + Udp => match writer { + Udp => 4, + R2UR => 2, + Uldc | VoteU | Umov => 2, + _ => { + panic!("Illegal writer in raw ureg latency {:?}", writer) + } + }, + VectorCoupled => match writer { + Udp => 6, + R2UR => 2, + Uldc | VoteU | Umov => 2, + _ => { + panic!("Illegal writer in raw ureg latency {:?}", writer) + } + }, + VectorDecoupled => match writer { + Udp => 9, + R2UR => 2, + Uldc | VoteU | Umov => 2, + _ => { + panic!("Illegal writer in raw ureg latency {:?}", writer) + } + }, + Uldc | VectorCoupledBindless | VectorDecoupledBindless => { + match writer { + Udp => 12, + R2UR => 2, + Uldc | VoteU | Umov => 5, + _ => { + panic!( + "Illegal writer in raw ureg latency {:?}", + writer + ) + } + } + } + Umov => match writer { + Udp => 7, + R2UR => 2, + Uldc | VoteU | Umov => 2, + _ => { + panic!("Illegal writer in raw ureg latency") + } + }, + _ => { + panic!("Illegal read in ureg raw latency") + } + } + } + + fn write_after_write( + writer1: URegLatencySM75, + writer2: URegLatencySM75, + has_pred: bool, + ) -> u32 { + use URegLatencySM75::*; + match writer2 { + Udp => match writer1 { + Udp => 1, + R2UR => 2, + Uldc | VoteU | Umov => 1, + _ => { + panic!("Illegal writer in ureg waw latency") + } + }, + R2UR => match writer1 { + Udp => pred(has_pred, 4, 6), + R2UR => 2, + Uldc | VoteU | Umov => 4, + _ => { + panic!("Illegal writer in ureg waw latency") + } + }, + Uldc | VoteU | Umov => match writer1 { + Udp => 7, + R2UR => 2, + Uldc | VoteU | Umov => 1, + _ => { + panic!("Illegal writer in ureg waw latency") + } + }, + _ => { + panic!("Illegal writer in ureg waw latency") + } + } + } + + fn write_after_read( + reader: URegLatencySM75, + writer: URegLatencySM75, + ) -> u32 { + use URegLatencySM75::*; + match writer { + Udp => 1, + R2UR => 1, + Uldc | VoteU | Umov => match reader { + Udp => 3, + _ => 1, + }, + _ => { + panic!("Illegal writer in ureg war latency") + } + } + } + + fn pred_read_after_write( + writer: URegLatencySM75, + reader: URegLatencySM75, + ) -> u32 { + use URegLatencySM75::*; + match reader { + Udp => match writer { + Udp => 4, + VoteU => 1, + _ => { + panic!("Illegal writer in upred raw latency") + } + }, + VectorCoupled => match writer { + Udp => 6, + VoteU => 1, + _ => { + panic!("Illegal writer in upred raw latency") + } + }, + GuardPredicate => match writer { + Udp => 11, + VoteU => 5, + _ => { + panic!("Illegal writer in upred raw latency") + } + }, + _ => { + panic!("Illegal reader in upred raw latency") + } + } + } + + fn pred_write_after_write( + writer1: URegLatencySM75, + writer2: URegLatencySM75, + ) -> u32 { + use URegLatencySM75::*; + match writer2 { + Udp => 1, + VoteU => match writer1 { + Udp => 7, + VoteU => 1, + _ => { + panic!("Illegal writer1 in upred raw latency") + } + }, + _ => { + panic!("Illegal writer2 in upred raw latency") + } + } + } + + fn pred_write_after_read( + reader: URegLatencySM75, + writer: URegLatencySM75, + ) -> u32 { + use URegLatencySM75::*; + match writer { + Udp => 1, + VoteU => match reader { + Udp => 2, + _ => 1, + }, + _ => { + panic!("Illegal writer2 in upred raw latency") + } + } + } +} + +pub struct SM75Latency {} + +impl SM75Latency { + pub fn needs_scoreboards(op: &Op) -> bool { + if op.is_uniform() { + match URegLatencySM75::op_category(op, false, 0) { + URegLatencySM75::R2UR => true, + _ => false, + } + } else { + match RegLatencySM75::op_category(op, false, 0) { + RegLatencySM75::RedirectedFP64 | + // We don't think fp16 needs scoreboarding on any known hw + // Put this back if we figure out it does. + //RegLatencySM75::RedirectedFP16 | + RegLatencySM75::RedirectedHMMA_884_F16 | + RegLatencySM75::RedirectedHMMA_884_F32 | + RegLatencySM75::RedirectedHMMA_1688 | + RegLatencySM75::RedirectedHMMA_16816 | + RegLatencySM75::IMMA | + RegLatencySM75::Decoupled => true, + _ => false + } + } + } + + /// if read is None pick the worst case raw latency + pub fn raw( + write: &Op, + dst_idx: usize, + read: Option<&Op>, + src_idx: usize, + ) -> u32 { + let dst_file = match write.dsts_as_slice()[dst_idx] { + Dst::None => return 0, + Dst::SSA(vec) => vec.file().unwrap(), + Dst::Reg(reg) => reg.file(), + }; + + match dst_file { + RegFile::GPR => { + let write_latency = + RegLatencySM75::op_category(write, false, dst_idx); + let read_latency = match read { + Some(op) => RegLatencySM75::op_category(op, true, src_idx), + None => RegLatencySM75::RedirectedFP64, + }; + return RegLatencySM75::read_after_write( + write_latency, + read_latency, + ); + } + RegFile::UGPR => { + let write_latency = + URegLatencySM75::op_category(write, false, dst_idx); + let read_latency = match read { + Some(op) => URegLatencySM75::op_category(op, true, src_idx), + None => URegLatencySM75::Uldc, + }; + return URegLatencySM75::read_after_write( + write_latency, + read_latency, + ); + } + RegFile::Pred => { + let write_latency = + RegLatencySM75::op_category(write, false, dst_idx); + let read_latency = match read { + Some(op) => RegLatencySM75::op_category(op, true, src_idx), + None => RegLatencySM75::GuardPredicate, + }; + return RegLatencySM75::pred_read_after_write( + write_latency, + read_latency, + ); + } + RegFile::UPred => { + let write_latency = + URegLatencySM75::op_category(write, false, dst_idx); + let read_latency = match read { + Some(op) => URegLatencySM75::op_category(op, true, src_idx), + None => URegLatencySM75::GuardPredicate, + }; + return URegLatencySM75::pred_read_after_write( + write_latency, + read_latency, + ); + } + RegFile::Bar => { + return 0; + } // Barriers have a HW scoreboard + _ => panic!("Not a register"), + } + } + + pub fn war(read: &Op, src_idx: usize, write: &Op, dst_idx: usize) -> u32 { + let dst_file = match write.dsts_as_slice()[dst_idx] { + Dst::None => return 0, + Dst::SSA(vec) => vec.file().unwrap(), + Dst::Reg(reg) => reg.file(), + }; + + match dst_file { + RegFile::GPR => { + let write_latency = + RegLatencySM75::op_category(write, false, dst_idx); + let read_latency = + RegLatencySM75::op_category(read, true, src_idx); + return RegLatencySM75::write_after_read( + read_latency, + write_latency, + ); + } + RegFile::UGPR => { + let write_latency = + URegLatencySM75::op_category(write, false, dst_idx); + let read_latency = + URegLatencySM75::op_category(read, true, src_idx); + return URegLatencySM75::write_after_read( + read_latency, + write_latency, + ); + } + RegFile::Pred => { + let write_latency = + RegLatencySM75::op_category(write, false, dst_idx); + let read_latency = + RegLatencySM75::op_category(read, true, src_idx); + return RegLatencySM75::pred_write_after_read( + read_latency, + write_latency, + ); + } + RegFile::UPred => { + let write_latency = + URegLatencySM75::op_category(write, false, dst_idx); + let read_latency = + URegLatencySM75::op_category(read, true, src_idx); + return URegLatencySM75::pred_write_after_read( + read_latency, + write_latency, + ); + } + _ => panic!("Not a register"), + } + } + + pub fn waw( + a: &Op, + a_dst_idx: usize, + b: &Op, + b_dst_idx: usize, + a_op_pred: bool, + ) -> u32 { + let dst_file = match a.dsts_as_slice()[a_dst_idx] { + Dst::None => return 0, + Dst::SSA(vec) => vec.file().unwrap(), + Dst::Reg(reg) => reg.file(), + }; + + match dst_file { + RegFile::GPR => { + let write1_latency = + RegLatencySM75::op_category(a, false, a_dst_idx); + let write2_latency = + RegLatencySM75::op_category(b, false, b_dst_idx); + return RegLatencySM75::write_after_write( + write1_latency, + write2_latency, + a_op_pred, + ); + } + RegFile::UGPR => { + let write1_latency = + URegLatencySM75::op_category(a, false, a_dst_idx); + let write2_latency = + URegLatencySM75::op_category(b, false, b_dst_idx); + return URegLatencySM75::write_after_write( + write1_latency, + write2_latency, + a_op_pred, + ); + } + RegFile::Pred => { + let write1_latency = + RegLatencySM75::op_category(a, false, a_dst_idx); + let write2_latency = + RegLatencySM75::op_category(b, false, b_dst_idx); + return RegLatencySM75::pred_write_after_write( + write1_latency, + write2_latency, + a_op_pred, + ); + } + RegFile::UPred => { + let write1_latency = + URegLatencySM75::op_category(a, false, a_dst_idx); + let write2_latency = + URegLatencySM75::op_category(b, false, b_dst_idx); + return URegLatencySM75::pred_write_after_write( + write1_latency, + write2_latency, + ); + } + _ => panic!("Not a register"), + } + } +}