nak: Add real instruction dependencies for Kepler

This commit ports the instruction latency information found in the
codegen emitter. Previously, every instruction was delayed by 16 cycles
even when a shorter delay would have sufficed.
PixMark Piano is highly sensitive to instruction latencies and gets a
2.5x boost; the other benchmarks also see better performance.
The two remaining pieces needed for feature parity with codegen are
functional-unit resource tracking and instruction dual-issue.

Performance measurements on a GTX 770 (at the 0f pstate):
PixMark Piano: 519 -> 14526 pts (rendering issues in both runs!)
FurMark: 3247 -> 5786 pts
The Talos Principle (high settings): 30-33 -> 55-60 FPS
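
An editor's sketch (not part of the commit; OpKind and latency() are
hypothetical, simplified stand-ins), with values taken from the new
instr_latency() table below:

    #[derive(Clone, Copy)]
    enum OpKind {
        Load,     // Ld/ALd
        Tex,      // Tex/Tld/Tld4/Tmml/Txd/Txq
        OtherAlu, // common ALU case
    }

    fn latency(op: OpKind) -> u32 {
        match op {
            OpKind::Load => 24,
            OpKind::Tex => 17,
            OpKind::OtherAlu => 9, // was a flat 16-cycle wait before
        }
    }

    fn main() {
        // A dependent read after an ALU op now waits 9 cycles, not 16;
        // loads and texture ops keep longer waits.
        assert_eq!(latency(OpKind::OtherAlu), 9);
        assert_eq!(latency(OpKind::Load), 24);
        assert_eq!(latency(OpKind::Tex), 17);
    }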

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35821>
Lorenzo Rossi 2025-07-09 23:49:53 +02:00 committed by Marge Bot
parent ba42b916ac
commit c35990c4bc
4 changed files with 257 additions and 143 deletions


@@ -28,6 +28,7 @@ mod qmd;
mod reg_tracker;
mod repair_ssa;
mod sm20;
mod sm30_instr_latencies;
mod sm32;
mod sm50;
mod sm70;


@@ -5,6 +5,10 @@ use crate::ir::*;
use crate::legalize::{
src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder,
};
use crate::sm30_instr_latencies::{
encode_kepler_shader, instr_exec_latency, instr_latency,
KeplerInstructionEncoder,
};
use bitview::*;
use rustc_hash::FxHashMap;
@@ -57,19 +61,18 @@ impl ShaderModel for ShaderModel20 {
false
}
fn exec_latency(&self, _op: &Op) -> u32 {
1
fn exec_latency(&self, op: &Op) -> u32 {
instr_exec_latency(self.sm, op)
}
fn raw_latency(
&self,
_write: &Op,
_dst_idx: usize,
write: &Op,
dst_idx: usize,
_read: &Op,
_src_idx: usize,
) -> u32 {
// TODO
13
instr_latency(self.sm, write, dst_idx)
}
fn war_latency(
@@ -79,7 +82,6 @@ impl ShaderModel for ShaderModel20 {
_write: &Op,
_dst_idx: usize,
) -> u32 {
// TODO
// We assume the source gets read in the first 4 cycles. We don't know
// how quickly the write will happen. This is all a guess.
4
@@ -87,27 +89,23 @@ impl ShaderModel for ShaderModel20 {
fn waw_latency(
&self,
_a: &Op,
_a_dst_idx: usize,
a: &Op,
a_dst_idx: usize,
_a_has_pred: bool,
_b: &Op,
_b_dst_idx: usize,
) -> u32 {
// We know our latencies are wrong, so assume the write could happen
// anywhere between 0 and instr_latency(a) cycles.
// TODO
13
instr_latency(self.sm, a, a_dst_idx)
}
fn paw_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
13
}
fn worst_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
15
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
instr_latency(self.sm, write, dst_idx)
}
fn max_instr_delay(&self) -> u8 {
@@ -119,7 +117,12 @@ impl ShaderModel for ShaderModel20 {
}
fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32> {
encode_sm20_shader(self, s)
if self.sm >= 30 {
// Kepler adds explicit instruction latency encodings
encode_sm30_shader(self, s)
} else {
encode_sm20_shader(self, s)
}
}
}
@@ -3081,3 +3084,38 @@ fn encode_sm20_shader(sm: &ShaderModel20, s: &Shader<'_>) -> Vec<u32> {
encoded
}
impl KeplerInstructionEncoder for ShaderModel20 {
fn encode_instr(
&self,
instr: &Instr,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
) {
let mut e = SM20Encoder {
sm: self,
ip: encoded.len() * 4,
labels,
inst: [0_u32; 2],
};
as_sm20_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
encoded.extend(&e.inst[..]);
}
fn prepare_sched_instr<'a>(
&self,
sched_instr: &'a mut [u32; 2],
) -> impl BitMutViewable + 'a {
let mut bv = BitMutView::new(sched_instr);
bv.set_field(0..4, 0b0111);
bv.set_field(60..64, 0b0010);
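// (Editor's note) The two fields above tag this 64-bit word as a
// schedule instruction; the remaining bits 4..60 hold seven 8-bit
// entries, one per instruction in the group, filled in by the caller.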
BitMutView::new_subset(sched_instr, 4..60)
}
}
fn encode_sm30_shader(sm: &ShaderModel20, s: &Shader<'_>) -> Vec<u32> {
assert!(sm.sm >= 30);
encode_kepler_shader(sm, s)
}


@@ -0,0 +1,161 @@
use bitview::{BitMutViewable, BitViewable, SetField};
use rustc_hash::FxHashMap;
use crate::ir::{Instr, InstrDeps, Label, Op, OpNop, Shader};
pub fn instr_latency(_sm: u8, op: &Op, _dst_idx: usize) -> u32 {
if op.is_fp64() {
return 20;
}
match op {
Op::Ipa(_) => 15,
Op::Ld(_) => 24,
Op::ALd(_) => 24,
Op::IMul(_) => 15, // This does not apply to imad, right? right???
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => 17,
_ => 9,
}
}
pub fn instr_exec_latency(_sm: u8, op: &Op) -> u32 {
match op {
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => 17,
Op::Exit(_) => 15,
_ => 1,
}
}
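// (Editor's interpretation, not from the commit:) instr_latency()
// above is the delay before an instruction's result may be consumed,
// while instr_exec_latency() is the delay charged to the instruction
// itself before the next one may issue.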
fn calc_instr_sched(prev_op: Option<&Op>, op: &Op, deps: &InstrDeps) -> u8 {
// Kepler is the first generation to lift scoreboarding from the
// hardware into the compiler. For each instruction we encode
// the delay but not all the other information necessary for newer
// architectures.
// The hardware still checks for data hazards and, if one is present,
// delays the instruction by 32 cycles.
match op {
Op::TexDepBar(_) => 0xc2,
Op::Sync(_) => 0x00, // Wait 16 cycles
_ => {
// TODO: when we support dual-issue this should check for
// both previous ops
let base = match prev_op {
Some(Op::ASt(_)) => 0x40,
_ => 0x20,
};
let delay = deps.delay;
debug_assert!(delay >= 1 && delay <= 32);
base | (delay - 1)
}
}
// 0x00: wait for 16 cycles
// 0x04: dual-issue with next instruction
// 0xc2: if TEXBAR
// 0x20 | 0x40: suspend for N+1 cycles (N = bitmask 0x1f)
// 0x40 only if prev_op is attribute store
// Unsure:
// 0x80: global memory bit
//
// TODO:
// - Dual issue (0x04)
// - Functional Unit tracking
}
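// Editor's illustration (not part of the commit): worked examples of
// calc_instr_sched(), with deps.delay in 1..=32:
//   delay 1 after a normal op: 0x20 | (1 - 1) = 0x20
//   delay 5 after a normal op: 0x20 | (5 - 1) = 0x24
//   delay 5 after an ASt:      0x40 | (5 - 1) = 0x44
// Op::Sync always encodes as 0x00, i.e. wait 16 cycles.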
pub trait KeplerInstructionEncoder {
/// Encode the instruction and push it into the "encoded" vec
fn encode_instr(
&self,
instr: &Instr,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
);
/// Prepare the scheduling instruction opcode-field and return a
/// subset where the actual scheduling information will be written
fn prepare_sched_instr<'a>(
&self,
sched_instr: &'a mut [u32; 2],
) -> impl BitMutViewable + 'a;
}
/// Helper function that encodes shaders for both KeplerA and KeplerB.
/// Differences between the encoders are handled by KeplerInstructionEncoder.
pub fn encode_kepler_shader<E>(encoder: &E, s: &Shader<'_>) -> Vec<u32>
where
E: KeplerInstructionEncoder,
{
const INSTR_LEN_BYTES: usize = 8;
assert!(s.functions.len() == 1);
let func = &s.functions[0];
// --- Compute label addresses ---
// We need a schedule instruction every 7 instructions; these don't
// define jump boundaries, so we can have multiple blocks in the same
// 7-instr group.
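// Example (editor's note): a block whose first instruction is real
// instruction 10 (ip = 10) is preceded by two sched instructions (one
// per started group of 7), so num_sched = 10/7 + 1 = 2 and the label's
// byte address is (10 + 2) * 8 = 96.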
let mut ip = 0_usize;
let mut labels = FxHashMap::default();
for b in &func.blocks {
let num_sched = (ip / 7) + 1;
labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES);
ip += b.instrs.len();
}
// --- Real encoding ---
// Create an instruction iterator and consume it in chunks of 7,
// padding the last chunk with nops (they should never be executed).
let mut instr_iter = func
.blocks
.iter()
.flat_map(|b| b.instrs.iter().map(|x| &**x))
.peekable();
let mut filling_instr = Instr {
pred: true.into(),
op: Op::Nop(OpNop { label: None }),
deps: InstrDeps::new(),
};
filling_instr.deps.set_delay(1);
let mut sched_chunk_gen = || {
if instr_iter.peek().is_none() {
return None;
}
Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr)))
};
let mut encoded = Vec::new();
let mut prev_op = None;
while let Some(sched_chunk) = sched_chunk_gen() {
let sched_i = encoded.len();
let mut sched_instr = [0u32; 2];
encoded.extend(&sched_instr[..]); // Push now, will edit later
let mut bv = encoder.prepare_sched_instr(&mut sched_instr);
// There should be 8 bits for each instr in a scheduling block
debug_assert!(bv.bits() == 8 * 7);
for (i, instr) in sched_chunk.iter().enumerate() {
encoder.encode_instr(&instr, &labels, &mut encoded);
let sched = calc_instr_sched(prev_op, &instr.op, &instr.deps);
bv.set_field(i * 8..(i + 1) * 8, sched);
prev_op = Some(&instr.op);
}
drop(bv);
encoded[sched_i] = sched_instr[0];
encoded[sched_i + 1] = sched_instr[1];
}
encoded
}


@@ -6,6 +6,10 @@ use crate::legalize::{
src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder,
PadValue,
};
use crate::sm30_instr_latencies::{
encode_kepler_shader, instr_exec_latency, instr_latency,
KeplerInstructionEncoder,
};
use bitview::{
BitMutView, BitMutViewable, BitView, BitViewable, SetBit, SetField,
};
@@ -60,34 +64,17 @@ impl ShaderModel for ShaderModel32 {
}
fn exec_latency(&self, op: &Op) -> u32 {
// TODO
match op {
Op::CCtl(_)
| Op::MemBar(_)
| Op::Bra(_)
| Op::SSy(_)
| Op::Sync(_)
| Op::Brk(_)
| Op::PBk(_)
| Op::Cont(_)
| Op::PCnt(_)
| Op::Exit(_)
| Op::Bar(_)
| Op::Kill(_)
| Op::OutFinal(_) => 13,
_ => 1,
}
instr_exec_latency(self.sm, op)
}
fn raw_latency(
&self,
_write: &Op,
_dst_idx: usize,
write: &Op,
dst_idx: usize,
_read: &Op,
_src_idx: usize,
) -> u32 {
// TODO
13
instr_latency(self.sm, write, dst_idx)
}
fn war_latency(
@@ -97,7 +84,6 @@ impl ShaderModel for ShaderModel32 {
_write: &Op,
_dst_idx: usize,
) -> u32 {
// TODO
// We assume the source gets read in the first 4 cycles. We don't know
// how quickly the write will happen. This is all a guess.
4
@@ -105,27 +91,23 @@ impl ShaderModel for ShaderModel32 {
fn waw_latency(
&self,
_a: &Op,
_a_dst_idx: usize,
a: &Op,
a_dst_idx: usize,
_a_has_pred: bool,
_b: &Op,
_b_dst_idx: usize,
) -> u32 {
// We know our latencies are wrong, so assume the write could happen
// anywhere between 0 and instr_latency(a) cycles.
// TODO
13
instr_latency(self.sm, a, a_dst_idx)
}
fn paw_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
13
}
fn worst_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
// TODO
15
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
instr_latency(self.sm, write, dst_idx)
}
fn max_instr_delay(&self) -> u8 {
@@ -161,7 +143,6 @@ struct SM32Encoder<'a> {
ip: usize,
labels: &'a FxHashMap<Label, usize>,
inst: [u32; 2],
sched: u8,
}
impl BitViewable for SM32Encoder<'_> {
@@ -323,23 +304,6 @@ impl SM32Encoder<'_> {
},
);
}
fn set_instr_dependency(&mut self, _deps: &InstrDeps) {
// TODO: scheduling
//let mut sched = BitMutView::new(&mut self.sched);
//sched.set_field(0..5, deps.delay);
self.sched = 0x00;
// 0x00: wait for 32 cycles
// 0x04: dual-issue with next instruction
// 0xc2 if TEXBAR
// 0x40 if EXPORT
// 0x20 otherwise(?)
// 0x80: global memory bit
// 0x40: EXPORT(?)
// 0x20: suspend for N cycles (N = bitmask 0x1f)
// 0x10: shared memory?
}
}
//
@@ -3409,86 +3373,36 @@ fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op {
as_sm50_op_match!(op)
}
fn encode_instr(
instr: &Instr,
sm: &ShaderModel32,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
) -> u8 {
let mut e = SM32Encoder {
sm: sm,
ip: encoded.len() * 4,
labels,
inst: [0_u32; 2],
sched: 0,
};
impl KeplerInstructionEncoder for ShaderModel32 {
fn encode_instr(
&self,
instr: &Instr,
labels: &FxHashMap<Label, usize>,
encoded: &mut Vec<u32>,
) {
let mut e = SM32Encoder {
sm: self,
ip: encoded.len() * 4,
labels,
inst: [0_u32; 2],
};
as_sm32_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
encoded.extend(&e.inst[..]);
}
as_sm32_op(&instr.op).encode(&mut e);
e.set_pred(&instr.pred);
e.set_instr_dependency(&instr.deps);
fn prepare_sched_instr<'a>(
&self,
sched_instr: &'a mut [u32; 2],
) -> impl BitMutViewable + 'a {
let mut bv = BitMutView::new(sched_instr);
bv.set_field(0..2, 0b00);
bv.set_field(58..64, 0b000010); // 0x08
encoded.extend(&e.inst[..]);
e.sched
BitMutView::new_subset(sched_instr, 2..58)
}
}
fn encode_sm32_shader(sm: &ShaderModel32, s: &Shader<'_>) -> Vec<u32> {
const INSTR_LEN_BYTES: usize = 8;
assert!(s.functions.len() == 1);
let func = &s.functions[0];
// --- Compute label addresses ---
// We need a schedule instruction every 7 instructions, these don't
// define jump boundaries so we can have multible blocks in the same
// 7-instr group.
let mut ip = 0_usize;
let mut labels = FxHashMap::default();
for b in &func.blocks {
let num_sched = (ip / 7) + 1;
labels.insert(b.label, (ip + num_sched) * INSTR_LEN_BYTES);
ip += b.instrs.len();
}
// --- Real encoding ---
// Create an instruction iterator and iterate it in chunks of 7.
// fill the last chunk with a nop (it should never be executed).
let mut instr_iter = func
.blocks
.iter()
.flat_map(|b| b.instrs.iter().map(|x| &**x))
.peekable();
let mut filling_instr = Instr {
pred: true.into(),
op: Op::Nop(OpNop { label: None }),
deps: InstrDeps::new(),
};
filling_instr.deps.set_delay(1);
let mut sched_chunk_gen = || {
if instr_iter.peek().is_none() {
return None;
}
Some([0; 7].map(|_| instr_iter.next().unwrap_or(&filling_instr)))
};
let mut encoded = Vec::new();
while let Some(sched_chunk) = sched_chunk_gen() {
let sched_i = encoded.len();
let mut sched_instr = [0u32; 2];
encoded.extend(&sched_instr[..]); // Push now, will edit later
let mut bv = BitMutView::new(&mut sched_instr);
bv.set_field(0..2, 0b00);
bv.set_field(58..64, 0b000010); // 0x80
let mut bv = bv.subset_mut(2..58);
for (i, instr) in sched_chunk.iter().enumerate() {
let sched = encode_instr(instr, sm, &labels, &mut encoded);
bv.set_field(i * 8..(i + 1) * 8, sched);
}
encoded[sched_i] = sched_instr[0];
encoded[sched_i + 1] = sched_instr[1];
}
encoded
encode_kepler_shader(sm, s)
}