diff --git a/src/nouveau/compiler/nak/lib.rs b/src/nouveau/compiler/nak/lib.rs index 94598ca19f0..fb6517bb061 100644 --- a/src/nouveau/compiler/nak/lib.rs +++ b/src/nouveau/compiler/nak/lib.rs @@ -28,6 +28,7 @@ mod qmd; mod reg_tracker; mod repair_ssa; mod sm20; +mod sm30_instr_latencies; mod sm32; mod sm50; mod sm70; diff --git a/src/nouveau/compiler/nak/sm20.rs b/src/nouveau/compiler/nak/sm20.rs index f5bf5540fe5..c4bdac3257f 100644 --- a/src/nouveau/compiler/nak/sm20.rs +++ b/src/nouveau/compiler/nak/sm20.rs @@ -5,6 +5,10 @@ use crate::ir::*; use crate::legalize::{ src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder, }; +use crate::sm30_instr_latencies::{ + encode_kepler_shader, instr_exec_latency, instr_latency, + KeplerInstructionEncoder, +}; use bitview::*; use rustc_hash::FxHashMap; @@ -57,19 +61,18 @@ impl ShaderModel for ShaderModel20 { false } - fn exec_latency(&self, _op: &Op) -> u32 { - 1 + fn exec_latency(&self, op: &Op) -> u32 { + instr_exec_latency(self.sm, op) } fn raw_latency( &self, - _write: &Op, - _dst_idx: usize, + write: &Op, + dst_idx: usize, _read: &Op, _src_idx: usize, ) -> u32 { - // TODO - 13 + instr_latency(self.sm, write, dst_idx) } fn war_latency( @@ -79,7 +82,6 @@ impl ShaderModel for ShaderModel20 { _write: &Op, _dst_idx: usize, ) -> u32 { - // TODO // We assume the source gets read in the first 4 cycles. We don't know // how quickly the write will happen. This is all a guess. 
4 @@ -87,27 +89,23 @@ impl ShaderModel for ShaderModel20 { fn waw_latency( &self, - _a: &Op, - _a_dst_idx: usize, + a: &Op, + a_dst_idx: usize, _a_has_pred: bool, _b: &Op, _b_dst_idx: usize, ) -> u32 { // We know our latencies are wrong so assume the wrote could happen // anywhere between 0 and instr_latency(a) cycles - - // TODO - 13 + instr_latency(self.sm, a, a_dst_idx) } fn paw_latency(&self, _write: &Op, _dst_idx: usize) -> u32 { - // TODO 13 } - fn worst_latency(&self, _write: &Op, _dst_idx: usize) -> u32 { - // TODO - 15 + fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 { + instr_latency(self.sm, write, dst_idx) } fn max_instr_delay(&self) -> u8 { @@ -119,7 +117,12 @@ impl ShaderModel for ShaderModel20 { } fn encode_shader(&self, s: &Shader<'_>) -> Vec { - encode_sm20_shader(self, s) + if self.sm >= 30 { + // Kepler adds explicit instruction latency encodings + encode_sm30_shader(self, s) + } else { + encode_sm20_shader(self, s) + } } } @@ -3081,3 +3084,38 @@ fn encode_sm20_shader(sm: &ShaderModel20, s: &Shader<'_>) -> Vec { encoded } + +impl KeplerInstructionEncoder for ShaderModel20 { + fn encode_instr( + &self, + instr: &Instr, + labels: &FxHashMap, + encoded: &mut Vec, + ) { + let mut e = SM20Encoder { + sm: self, + ip: encoded.len() * 4, + labels, + inst: [0_u32; 2], + }; + as_sm20_op(&instr.op).encode(&mut e); + e.set_pred(&instr.pred); + encoded.extend(&e.inst[..]); + } + + fn prepare_sched_instr<'a>( + &self, + sched_instr: &'a mut [u32; 2], + ) -> impl BitMutViewable + 'a { + let mut bv = BitMutView::new(sched_instr); + bv.set_field(0..4, 0b0111); + bv.set_field(60..64, 0b0010); + + BitMutView::new_subset(sched_instr, 4..60) + } +} + +fn encode_sm30_shader(sm: &ShaderModel20, s: &Shader<'_>) -> Vec { + assert!(sm.sm >= 30); + encode_kepler_shader(sm, s) +} diff --git a/src/nouveau/compiler/nak/sm30_instr_latencies.rs b/src/nouveau/compiler/nak/sm30_instr_latencies.rs new file mode 100644 index 00000000000..011da013382 --- /dev/null +++ 
b/src/nouveau/compiler/nak/sm30_instr_latencies.rs
@@ -0,0 +1,161 @@
+use bitview::{BitMutViewable, BitViewable, SetField};
+use rustc_hash::FxHashMap;
+
+use crate::ir::{Instr, InstrDeps, Label, Op, OpNop, Shader};
+
+pub fn instr_latency(_sm: u8, op: &Op, _dst_idx: usize) -> u32 {
+    if op.is_fp64() {
+        return 20;
+    }
+
+    match op {
+        Op::Ipa(_) => 15,
+        Op::Ld(_) => 24,
+        Op::ALd(_) => 24,
+        Op::IMul(_) => 15, // This does not apply to imad, right? right???
+        Op::Tex(_)
+        | Op::Tld(_)
+        | Op::Tld4(_)
+        | Op::Tmml(_)
+        | Op::Txd(_)
+        | Op::Txq(_) => 17,
+        _ => 9,
+    }
+}
+
+pub fn instr_exec_latency(_sm: u8, op: &Op) -> u32 {
+    match op {
+        Op::Tex(_)
+        | Op::Tld(_)
+        | Op::Tld4(_)
+        | Op::Tmml(_)
+        | Op::Txd(_)
+        | Op::Txq(_) => 17,
+        Op::Exit(_) => 15,
+        _ => 1,
+    }
+}
+
+fn calc_instr_sched(prev_op: Option<&Op>, op: &Op, deps: &InstrDeps) -> u8 {
+    // Kepler is the first generation to lift scoreboarding from the
+    // hardware into the compiler. For each instruction we encode
+    // the delay but not all the other information necessary for newer
+    // architectures.
+    // The hardware still checks for data-hazard and, if present, it
+    // will delay the instruction by 32 cycles.
+    match op {
+        Op::TexDepBar(_) => 0xc2,
+        Op::Sync(_) => 0x00, // Wait 16 cycles
+        _ => {
+            // TODO: when we support dual-issue this should check for
+            // both previous ops
+            let base = match prev_op {
+                Some(Op::ASt(_)) => 0x40,
+                _ => 0x20,
+            };
+
+            let delay = deps.delay;
+            debug_assert!(delay >= 1 && delay <= 32);
+            base | (delay - 1)
+        }
+    }
+
+    // 0x00: wait for 16 cycles
+    // 0x04: dual-issue with next instruction
+    // 0xc2: if TEXBAR
+    // 0x20 | 0x40: suspend for N+1 cycles (N = bitmask 0x1f)
+    //      0x40 only if prev_op is attribute store
+    // Unsure:
+    // 0x80: global memory bit
+    //
+    // TODO:
+    // - Dual issue (0x04)
+    // - Functional Unit tracking
+}
+
+pub trait KeplerInstructionEncoder {
+    /// Encode the instruction and push it into the "encoded" vec
+    fn encode_instr(
+        &self,
+        instr: &Instr,
+        labels: &FxHashMap<Label, usize>,
+        encoded: &mut Vec<u32>,
+    );
+
+    /// Prepare the scheduling instruction opcode-field and return a
+    /// subset where the actual scheduling information will be written
+    fn prepare_sched_instr<'a>(
+        &self,
+        sched_instr: &'a mut [u32; 2],
+    ) -> impl BitMutViewable + 'a;
+}
+
+/// Helper function that encodes shaders for both KeplerA and KeplerB.
+/// Differences in the encoders are handled by KeplerInstructionEncoder.
+pub fn encode_kepler_shader<E>(encoder: &E, s: &Shader<'_>) -> Vec<u32>
+where
+    E: KeplerInstructionEncoder,
+{
+    const INSTR_LEN_BYTES: usize = 8;
+    assert!(s.functions.len() == 1);
+    let func = &s.functions[0];
+
+    // --- Compute label addresses ---
+    // We need a schedule instruction every 7 instructions, these don't
+    // define jump boundaries so we can have multiple blocks in the same
+    // 7-instr group.
+    let mut ip = 0_usize;
+    let mut labels = FxHashMap::default();
+    for b in &func.blocks {
+        let num_sched = (ip / 7) + 1;
+        labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES);
+        ip += b.instrs.len();
+    }
+
+    // --- Real encoding ---
+    // Create an instruction iterator and iterate it in chunks of 7.
+    // fill the last chunk with a nop (it should never be executed).
+    let mut instr_iter = func
+        .blocks
+        .iter()
+        .flat_map(|b| b.instrs.iter().map(|x| &**x))
+        .peekable();
+    let mut filling_instr = Instr {
+        pred: true.into(),
+        op: Op::Nop(OpNop { label: None }),
+        deps: InstrDeps::new(),
+    };
+    filling_instr.deps.set_delay(1);
+    let mut sched_chunk_gen = || {
+        if instr_iter.peek().is_none() {
+            return None;
+        }
+        Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr)))
+    };
+
+    let mut encoded = Vec::new();
+    let mut prev_op = None;
+    while let Some(sched_chunk) = sched_chunk_gen() {
+        let sched_i = encoded.len();
+
+        let mut sched_instr = [0u32; 2];
+        encoded.extend(&sched_instr[..]); // Push now, will edit later
+        let mut bv = encoder.prepare_sched_instr(&mut sched_instr);
+        // There should be 8 bits for each instr in a scheduling block
+        debug_assert!(bv.bits() == 8 * 7);
+
+        for (i, instr) in sched_chunk.iter().enumerate() {
+            encoder.encode_instr(&instr, &labels, &mut encoded);
+
+            let sched = calc_instr_sched(prev_op, &instr.op, &instr.deps);
+            bv.set_field(i * 8..(i + 1) * 8, sched);
+            prev_op = Some(&instr.op);
+        }
+
+        drop(bv);
+        encoded[sched_i] = sched_instr[0];
+        encoded[sched_i + 1] = sched_instr[1];
+    }
+
+    encoded
+}
diff --git a/src/nouveau/compiler/nak/sm32.rs b/src/nouveau/compiler/nak/sm32.rs
index 35f56ca5d6d..4b1e64e9255 100644
--- a/src/nouveau/compiler/nak/sm32.rs
+++ b/src/nouveau/compiler/nak/sm32.rs
@@ -6,6 +6,10 @@ use crate::legalize::{
     src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder, PadValue,
 };
+use crate::sm30_instr_latencies::{
+    encode_kepler_shader, instr_exec_latency, instr_latency,
+    KeplerInstructionEncoder,
+};
 use bitview::{
     BitMutView, BitMutViewable, BitView, BitViewable, SetBit, SetField,
 };
 use rustc_hash::FxHashMap;
@@ -60,34 +64,17 @@ impl ShaderModel for ShaderModel32 {
     }
 
     fn exec_latency(&self, op: &Op) -> u32 {
-        // TODO
-        match op {
-            Op::CCtl(_)
-            | Op::MemBar(_)
-            | Op::Bra(_)
-            | Op::SSy(_)
-            | 
Op::Sync(_) - | Op::Brk(_) - | Op::PBk(_) - | Op::Cont(_) - | Op::PCnt(_) - | Op::Exit(_) - | Op::Bar(_) - | Op::Kill(_) - | Op::OutFinal(_) => 13, - _ => 1, - } + instr_exec_latency(self.sm, op) } fn raw_latency( &self, - _write: &Op, - _dst_idx: usize, + write: &Op, + dst_idx: usize, _read: &Op, _src_idx: usize, ) -> u32 { - // TODO - 13 + instr_latency(self.sm, write, dst_idx) } fn war_latency( @@ -97,7 +84,6 @@ impl ShaderModel for ShaderModel32 { _write: &Op, _dst_idx: usize, ) -> u32 { - // TODO // We assume the source gets read in the first 4 cycles. We don't know // how quickly the write will happen. This is all a guess. 4 @@ -105,27 +91,23 @@ impl ShaderModel for ShaderModel32 { fn waw_latency( &self, - _a: &Op, - _a_dst_idx: usize, + a: &Op, + a_dst_idx: usize, _a_has_pred: bool, _b: &Op, _b_dst_idx: usize, ) -> u32 { // We know our latencies are wrong so assume the wrote could happen // anywhere between 0 and instr_latency(a) cycles - - // TODO - 13 + instr_latency(self.sm, a, a_dst_idx) } fn paw_latency(&self, _write: &Op, _dst_idx: usize) -> u32 { - // TODO 13 } - fn worst_latency(&self, _write: &Op, _dst_idx: usize) -> u32 { - // TODO - 15 + fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 { + instr_latency(self.sm, write, dst_idx) } fn max_instr_delay(&self) -> u8 { @@ -161,7 +143,6 @@ struct SM32Encoder<'a> { ip: usize, labels: &'a FxHashMap, inst: [u32; 2], - sched: u8, } impl BitViewable for SM32Encoder<'_> { @@ -323,23 +304,6 @@ impl SM32Encoder<'_> { }, ); } - - fn set_instr_dependency(&mut self, _deps: &InstrDeps) { - // TODO: schedulng - //let mut sched = BitMutView::new(&mut self.sched); - //sched.set_field(0..5, deps.delay); - self.sched = 0x00; - // 0x00: wait for 32 cycles - // 0x04: dual-issue with next instruction - // 0xc2 if TEXBAR - // 0x40 if EXPORT - // 0x20 otherwise(?) - - // 0x80: global memory bit - // 0x40: EXPORT(?) - // 0x20: suspend for N cycles (N = bitmask 0x1f) - // 0x10: shared memory? 
- } } // @@ -3409,86 +3373,36 @@ fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op { as_sm50_op_match!(op) } -fn encode_instr( - instr: &Instr, - sm: &ShaderModel32, - labels: &FxHashMap, - encoded: &mut Vec, -) -> u8 { - let mut e = SM32Encoder { - sm: sm, - ip: encoded.len() * 4, - labels, - inst: [0_u32; 2], - sched: 0, - }; +impl KeplerInstructionEncoder for ShaderModel32 { + fn encode_instr( + &self, + instr: &Instr, + labels: &FxHashMap, + encoded: &mut Vec, + ) { + let mut e = SM32Encoder { + sm: self, + ip: encoded.len() * 4, + labels, + inst: [0_u32; 2], + }; + as_sm32_op(&instr.op).encode(&mut e); + e.set_pred(&instr.pred); + encoded.extend(&e.inst[..]); + } - as_sm32_op(&instr.op).encode(&mut e); - e.set_pred(&instr.pred); - e.set_instr_dependency(&instr.deps); + fn prepare_sched_instr<'a>( + &self, + sched_instr: &'a mut [u32; 2], + ) -> impl BitMutViewable + 'a { + let mut bv = BitMutView::new(sched_instr); + bv.set_field(0..2, 0b00); + bv.set_field(58..64, 0b000010); // 0x08 - encoded.extend(&e.inst[..]); - - e.sched + BitMutView::new_subset(sched_instr, 2..58) + } } fn encode_sm32_shader(sm: &ShaderModel32, s: &Shader<'_>) -> Vec { - const INSTR_LEN_BYTES: usize = 8; - assert!(s.functions.len() == 1); - let func = &s.functions[0]; - - // --- Compute label addresses --- - // We need a schedule instruction every 7 instructions, these don't - // define jump boundaries so we can have multible blocks in the same - // 7-instr group. - let mut ip = 0_usize; - let mut labels = FxHashMap::default(); - for b in &func.blocks { - let num_sched = (ip / 7) + 1; - labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES); - ip += b.instrs.len(); - } - - // --- Real encoding --- - // Create an instruction iterator and iterate it in chunks of 7. - // fill the last chunk with a nop (it should never be executed). 
- let mut instr_iter = func - .blocks - .iter() - .flat_map(|b| b.instrs.iter().map(|x| &**x)) - .peekable(); - let mut filling_instr = Instr { - pred: true.into(), - op: Op::Nop(OpNop { label: None }), - deps: InstrDeps::new(), - }; - filling_instr.deps.set_delay(1); - let mut sched_chunk_gen = || { - if instr_iter.peek().is_none() { - return None; - } - Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr))) - }; - - let mut encoded = Vec::new(); - while let Some(sched_chunk) = sched_chunk_gen() { - let sched_i = encoded.len(); - - let mut sched_instr = [0u32; 2]; - encoded.extend(&sched_instr[..]); // Push now, will edit later - let mut bv = BitMutView::new(&mut sched_instr); - bv.set_field(0..2, 0b00); - bv.set_field(58..64, 0b000010); // 0x80 - let mut bv = bv.subset_mut(2..58); - - for (i, instr) in sched_chunk.iter().enumerate() { - let sched = encode_instr(instr, sm, &labels, &mut encoded); - - bv.set_field(i * 8..(i + 1) * 8, sched); - } - encoded[sched_i] = sched_instr[0]; - encoded[sched_i + 1] = sched_instr[1]; - } - - encoded + encode_kepler_shader(sm, s) }