nak/kepler: Add texdepbar insertion pass

This commit adds a forward data-flow pass to insert texdepbar before using registers of texture fetch instructions. The new algorithm started as a port of the old codegen pass, but finished in a complete rewrite that is substantially simpler and should generate less conservative code in some edge cases. Signed-off-by: Lorenzo Rossi <snowycoder@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35403>
2026-06-06 04:08:48 +02:00 · 2025-07-15 21:57:00 +02:00 · 2025-07-15 21:57:00 +02:00 · f2e6bacafd
commit f2e6bacafd
parent fbeb70cbbc
3 changed files with 433 additions and 8 deletions
--- a/src/nouveau/compiler/nak/calc_instr_deps.rs
+++ b/src/nouveau/compiler/nak/calc_instr_deps.rs
@ -2,12 +2,14 @@
 // SPDX-License-Identifier: MIT

 use crate::api::{GetDebugFlags, DEBUG};
+use crate::dataflow::ForwardDataflow;
 use crate::ir::*;
 use crate::reg_tracker::RegTracker;

 use rustc_hash::{FxHashMap, FxHashSet};
 use std::cmp::max;
-use std::slice;
+use std::ops::Range;
+use std::{slice, u32, u8};

 #[derive(Clone)]
 enum RegUse<T: Clone> {
@ -229,6 +231,342 @@ impl BarAlloc {
    }
 }

+/// Position of a single destination register in the simulated texture
+/// queue.
+///
+/// `min_pos` is the smallest position the register may have on the queue,
+/// where 0 is the most recently pushed texture operation.  The value
+/// `u8::MAX` is reserved to mean "not on the queue" (see
+/// `TexQueueSimulationEntry::INVALID`).
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+struct TexQueueSimulationEntry {
+    min_pos: u8,
+}
+
+impl TexQueueSimulationEntry {
+    /// Sentinel for a register that is not on the queue
+    const INVALID: Self = TexQueueSimulationEntry { min_pos: u8::MAX };
+
+    /// Position of a register that has just been pushed on the queue
+    const FIRST: Self = TexQueueSimulationEntry { min_pos: 0 };
+
+    /// Returns true if this register is currently on the queue.
+    fn is_valid(&self) -> bool {
+        if *self == Self::INVALID {
+            return false;
+        }
+
+        // Anything that is not the sentinel must be a real queue position
+        debug_assert!(self.min_pos <= OpTexDepBar::MAX_TEXTURES_LEFT);
+        true
+    }
+
+    /// Moves the entry one position back, as happens when a new texture
+    /// operation is pushed in front of it.  Invalid entries are untouched.
+    fn push(&mut self) {
+        if !self.is_valid() {
+            return;
+        }
+        self.min_pos += 1;
+    }
+
+    /// Drops the entry if its position is `pos` or higher.
+    ///
+    /// Returns true if the entry is still on the queue afterwards.  Invalid
+    /// entries compare higher than any `pos` and therefore stay invalid.
+    fn flush_after(&mut self, pos: u8) -> bool {
+        let keep = self.min_pos < pos;
+        if !keep {
+            *self = Self::INVALID;
+        }
+        keep
+    }
+
+    /// Keeps the lower of the two positions.  Since the invalid sentinel is
+    /// the highest value, a valid entry always wins over an invalid one.
+    fn merge(&mut self, other: &Self) {
+        if other.min_pos < self.min_pos {
+            self.min_pos = other.min_pos;
+        }
+    }
+}
+
+/// Simulates the queue state of a group of 4 consecutive registers.
+///
+/// `entries[i]` holds the queue position of the i-th register of the
+/// bucket, or the invalid entry if that register is not on the queue.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+struct TexQueueSimulationBucket {
+    entries: [TexQueueSimulationEntry; 4],
+}
+
+impl TexQueueSimulationBucket {
+    /// Bucket in which none of the four registers is on the queue
+    const EMPTY: Self = TexQueueSimulationBucket {
+        entries: [TexQueueSimulationEntry::INVALID; 4],
+    };
+
+    /// Lowest queue position among the registers selected by `range`, or
+    /// `None` if none of them is on the queue.
+    fn min_queue_position(&self, range: Range<usize>) -> Option<u8> {
+        let selected = &self.entries[range];
+        selected
+            .iter()
+            .filter_map(|e| if e.is_valid() { Some(e.min_pos) } else { None })
+            .min()
+    }
+
+    /// Puts the registers selected by `range` at the head of the queue.
+    /// They must not already be on the queue.
+    fn set_as_first(&mut self, range: Range<usize>) {
+        range.for_each(|i| {
+            let slot = &mut self.entries[i];
+            debug_assert!(!slot.is_valid());
+            *slot = TexQueueSimulationEntry::FIRST;
+        });
+    }
+
+    /// Moves every register that is on the queue one position back.
+    fn push(&mut self) {
+        self.entries
+            .iter_mut()
+            .for_each(TexQueueSimulationEntry::push);
+    }
+
+    /// Drops every register whose position is `pos` or higher.
+    ///
+    /// Returns true if at least one register of the bucket is still on the
+    /// queue, i.e. if the bucket is worth keeping around.
+    fn flush_after(&mut self, pos: u8) -> bool {
+        debug_assert!(pos <= OpTexDepBar::MAX_TEXTURES_LEFT);
+
+        // Non-short-circuiting `|`: every entry must be visited.
+        self.entries
+            .iter_mut()
+            .fold(false, |retain, e| e.flush_after(pos) | retain)
+    }
+
+    /// Entry-wise merge with `other`.
+    fn merge(&mut self, other: &Self) {
+        for (mine, theirs) in self.entries.iter_mut().zip(&other.entries) {
+            mine.merge(theirs);
+        }
+    }
+}
+
+/// This state simulates the texture queue for each destination.
+///
+/// For example, at the start the queue is always empty, but if we encounter a
+/// tex operation that writes in r4..r8, that is pushed on the queue at
+/// position 0.  If we encounter another tex operation that only writes r9,
+/// that will be pushed at position 0 and the old tex instruction will be in
+/// position 1.  (A second tex writing a register that is still on the queue
+/// would first flush the older entry, since destinations are flushed before
+/// being pushed.)  This data-structure keeps track of the position in the
+/// queue of each destination register present in the queue.  Push operations
+/// correspond to new texture instructions, while flush operations correspond
+/// to the usage of registers which may still be on the queue.
+///
+/// Since all Kepler texture operations use at most 4 registers, and many
+/// instructions use more than one destination at a time, we group registers
+/// in buckets of 4.  With this optimization each RegRef only accesses a
+/// single bucket.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TexQueueSimulationState {
+    /// Min position of the destination register in the queue,
+    /// in buckets of 4 (indexed by register_index / 4).
+    /// Buckets without any register on the queue are not stored.
+    queue_pos: FxHashMap<u8, TexQueueSimulationBucket>,
+    /// Max length of the queue, needed to check for overflows
+    max_queue_len: u8,
+}
+
+impl TexQueueSimulationState {
+    /// Creates the state of an empty texture queue.
+    pub fn new() -> Self {
+        TexQueueSimulationState {
+            queue_pos: Default::default(),
+            max_queue_len: 0,
+        }
+    }
+
+    /// Translate from RegRef to bucket_index + bucket_range
+    ///
+    /// Panics if the register range does not fit in a single bucket.
+    #[inline]
+    fn reg_ref_to_coords(reg: RegRef) -> (u8, Range<usize>) {
+        debug_assert!(reg.base_idx() <= u8::MAX.into());
+        let idx = reg.base_idx() as u8 / 4;
+        let sub = (reg.base_idx() % 4) as usize;
+
+        let range = sub..(sub + reg.comps() as usize);
+        assert!(range.end <= 4);
+        (idx, range)
+    }
+
+    /// Lowest queue position of any register in `reg`, or `None` if none
+    /// of them is on the queue.
+    fn min_queue_position(&self, reg: RegRef) -> Option<u8> {
+        let (idx, range) = Self::reg_ref_to_coords(reg);
+
+        self.queue_pos
+            .get(&idx)
+            .and_then(|x| x.min_queue_position(range))
+    }
+
+    fn is_queue_full(&self) -> bool {
+        // MAX_TEXTURES_LEFT describes the maximum number encodable
+        // in the texdepbar, but the queue must have an element more.
+        self.max_queue_len > OpTexDepBar::MAX_TEXTURES_LEFT
+    }
+
+    /// Flush every element whose position >= pos
+    ///
+    /// Effectively simulates the execution of a `texdepbar pos`
+    fn flush_after(&mut self, pos: u8) {
+        self.max_queue_len = self.max_queue_len.min(pos);
+        // Buckets that end up with no register on the queue are dropped
+        self.queue_pos.retain(|_, v| v.flush_after(pos));
+    }
+
+    /// Simulates a texture operation writing `reg`.
+    ///
+    /// Returns the level of the barrier that must be executed before the
+    /// operation to keep the queue from overflowing, if any.
+    pub fn push(&mut self, reg: RegRef) -> Option<u8> {
+        // Assert we are not on the queue
+        debug_assert!(self.min_queue_position(reg).is_none());
+
+        // Check that the push operation does not overflow the queue,
+        // if it does, we must insert a barrier
+        let mut tex_bar = None;
+        if self.is_queue_full() {
+            // The queue is full: MAX_TEXTURES_LEFT + 1 tex-ops may be in
+            // flight.  Make space by waiting for the oldest one.
+            tex_bar = Some(OpTexDepBar::MAX_TEXTURES_LEFT);
+            self.flush_after(OpTexDepBar::MAX_TEXTURES_LEFT);
+            // Now the queue is not full anymore
+            debug_assert!(!self.is_queue_full());
+        }
+
+        self.max_queue_len += 1;
+        // Every entry is pushed by 1
+        for x in self.queue_pos.values_mut() {
+            x.push();
+        }
+
+        // Put us on the queue as first
+        let (idx, range) = Self::reg_ref_to_coords(reg);
+        self.queue_pos
+            .entry(idx)
+            .or_insert(TexQueueSimulationBucket::EMPTY)
+            .set_as_first(range);
+
+        tex_bar
+    }
+
+    /// Simulates an access to `reg`.
+    ///
+    /// If any register of `reg` is still on the queue, the queue is cut at
+    /// its position and that position is returned: it is the level of the
+    /// barrier needed before the access.  Returns `None` otherwise.
+    pub fn flush(&mut self, reg: RegRef) -> Option<u8> {
+        let queue_pos = self.min_queue_position(reg);
+
+        let Some(queue_pos) = queue_pos else {
+            return None; // Not in queue
+        };
+
+        // Cut the queue
+        self.flush_after(queue_pos);
+        debug_assert!(self.min_queue_position(reg).is_none());
+
+        Some(queue_pos)
+    }
+
+    /// Joins `other` into this state: keeps the longest queue length and,
+    /// for each register, the lowest position seen in either state.
+    pub fn merge(&mut self, other: &Self) {
+        self.max_queue_len = self.max_queue_len.max(other.max_queue_len);
+        for (key, y) in other.queue_pos.iter() {
+            let x = self
+                .queue_pos
+                .entry(*key)
+                .or_insert(TexQueueSimulationBucket::EMPTY);
+            x.merge(y);
+        }
+    }
+
+    /// Simulates the execution of an instruction and returns the
+    /// barrier level needed.
+    pub fn visit_instr(&mut self, instr: &Instr) -> Option<u8> {
+        // Flush register reads and writes
+        // (avoid write-after-write and read-after-write hazards)
+        // Compute the minimum required flush level (for barriers)
+        let flush_level = if !self.queue_pos.is_empty() {
+            let src_refs =
+                instr.srcs().iter().filter_map(|x| x.src_ref.as_reg());
+            let dst_refs = instr.dsts().iter().filter_map(|x| x.as_reg());
+
+            src_refs
+                .chain(dst_refs)
+                // The queue tracks texture destinations, which this pass
+                // assumes are GPRs.  Buckets are keyed by register index
+                // alone, so a register from another file (e.g. a predicate)
+                // would alias the GPR with the same index and trigger a
+                // spurious barrier.
+                .filter(|reg_ref| matches!(reg_ref.file(), RegFile::GPR))
+                .filter_map(|reg_ref| self.flush(*reg_ref))
+                .reduce(|a, b| a.min(b))
+        } else {
+            // The queue is empty, no need to check the instruction
+            None
+        };
+
+        // Push registers (if we are a tex instruction)
+        // We might need to insert a barrier if the queue is full
+        let push_level = if instr_needs_texbar(instr) {
+            // Only the first destination is tracked on the queue
+            let dst = instr.dsts()[0].as_reg().unwrap();
+            self.push(*dst)
+        } else {
+            None
+        };
+
+        // If the flush needs a barrier, the queue will not be full,
+        // therefore the push will not need a barrier.
+        debug_assert!(flush_level.is_none() || push_level.is_none());
+        flush_level.or(push_level)
+    }
+}
+
+/// Returns true for the texture operations that are tracked on the
+/// simulated texture queue.
+fn instr_needs_texbar(instr: &Instr) -> bool {
+    match instr.op {
+        Op::Tex(_)
+        | Op::Tld(_)
+        | Op::Tmml(_)
+        | Op::Tld4(_)
+        | Op::Txd(_)
+        | Op::Txq(_) => true,
+        _ => false,
+    }
+}
+
+/// Inserts `texdepbar` instructions before the uses of texture results.
+///
+/// Hardware has a FIFO queue of textures that are still being fetched.
+/// When the oldest tex finishes executing, its result is written to the
+/// register, it is removed from the queue and the next one starts executing.
+/// The problem arises when a texture result is read while it is still being
+/// fetched.  To avoid it, we have a `texdepbar {i}` instruction that stalls
+/// until the texture fetch queue has at most {i} elements.
+/// e.g. the most simple solution is to have texdepbar 0 after each texture
+/// instruction, but this would stall the pipeline until the texture fetch
+/// finishes executing.
+/// This algorithm inserts `texdepbar` at each use of the texture results,
+/// simulating the texture queue execution.
+///
+/// Note that the texture queue has for each entry (texture data, register
+/// output) and each register can be on the queue only once (we don't want to
+/// have multiple texture operations in-flight that write to the same
+/// registers).
+/// This can lead to a neat algorithm:
+/// instead of tracking the queue directly, which can exponentially explode
+/// in complexity, track the position of each register, which needs at most
+/// 255/63 positions.
+/// For branches the state is duplicated in each basic block,
+/// for joins instead we want to keep both the minimum position of each
+/// entry and the maximum length of the queue to avoid overflows.
+///
+/// TODO: If this pass is too slow, there are still optimizations left:
+/// - Our data-flow computes barrier levels and discards them,
+///   but since most CFG blocks do not need recomputation, we could save
+///   the barrier levels in a vec and save a pass later.
+/// - Instead of pushing by 1 each element in the queue on a `push` op,
+///   we could keep track of an in-flight range and use a wrapping timestamp.
+///   This improves performance but needs careful implementation to avoid
+///   bugs.
+fn insert_texture_barriers(f: &mut Function, sm: &dyn ShaderModel) {
+    assert!(sm.is_kepler()); // Only kepler has texture barriers!
+
+    // One queue state at the entry and one at the exit of each block,
+    // all starting out as an empty queue.
+    let mut state_in: Vec<_> = (0..f.blocks.len())
+        .map(|_| TexQueueSimulationState::new())
+        .collect();
+    let mut state_out: Vec<_> = (0..f.blocks.len())
+        .map(|_| TexQueueSimulationState::new())
+        .collect();
+    ForwardDataflow {
+        cfg: &f.blocks,
+        block_in: &mut state_in[..],
+        block_out: &mut state_out[..],
+        transfer: |_block_idx, block, sim_out, sim_in| {
+            // Simulate the whole block starting from its entry state
+            let mut sim = sim_in.clone();
+
+            for instr in block.instrs.iter() {
+                // Ignore the barrier, we will recompute this later
+                let _bar = sim.visit_instr(&instr);
+            }
+
+            // Report whether the exit state of the block changed
+            if *sim_out == sim {
+                false
+            } else {
+                *sim_out = sim;
+                true
+            }
+        },
+        join: |sim_out, pred_sim_in| {
+            sim_out.merge(pred_sim_in);
+        },
+    }
+    .solve();
+
+    // Second pass: replay each block from its final entry state and emit
+    // a texdepbar in front of every instruction that needs one.
+    for (block, mut sim) in f.blocks.iter_mut().zip(state_in.into_iter()) {
+        block.map_instrs(|instr| {
+            if let Some(textures_left) = sim.visit_instr(&instr) {
+                let bar = Instr::new_boxed(OpTexDepBar { textures_left });
+                MappedInstrs::Many(vec![bar, instr])
+            } else {
+                MappedInstrs::One(instr)
+            }
+        });
+    }
+}
+
 fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
    let mut uses = Box::new(RegTracker::new_with(&|| RegUse::None));
    let mut deps = DepGraph::new();
@ -502,6 +840,12 @@ impl Shader<'_> {
    }

    pub fn calc_instr_deps(&mut self) {
+        if self.sm.is_kepler() {
+            for f in &mut self.functions {
+                insert_texture_barriers(f, self.sm);
+            }
+        }
+
        if DEBUG.serial() {
            self.assign_deps_serial();
        } else {
@ -524,3 +868,82 @@ impl Shader<'_> {
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds a GPR reference covering the registers in `range`.
+    fn reg_gpr(range: Range<usize>) -> RegRef {
+        RegRef::new(
+            RegFile::GPR,
+            range.start as u32,
+            (range.end - range.start) as u8,
+        )
+    }
+
+    #[test]
+    fn test_texdepbar_basic() {
+        let mut sim = TexQueueSimulationState::new();
+
+        // RaW
+        assert_eq!(sim.push(reg_gpr(0..4)), None);
+        assert_eq!(sim.flush(reg_gpr(2..3)), Some(0));
+
+        // 2 entries in the queue
+        assert_eq!(sim.push(reg_gpr(0..2)), None); // [A]
+        assert_eq!(sim.push(reg_gpr(2..4)), None); // [B, A]
+        assert_eq!(sim.flush(reg_gpr(0..1)), Some(1)); // [B]
+        assert_eq!(sim.flush(reg_gpr(3..4)), Some(0)); // []
+
+        // Test bucket conflicts
+        assert_eq!(sim.push(reg_gpr(0..1)), None);
+        assert_eq!(sim.flush(reg_gpr(1..3)), None);
+        assert_eq!(sim.flush(reg_gpr(0..3)), Some(0));
+
+        // Bucket conflict part 2: Electric Boogaloo
+        assert_eq!(sim.push(reg_gpr(1..2)), None);
+        assert_eq!(sim.push(reg_gpr(0..1)), None);
+        assert_eq!(sim.flush(reg_gpr(1..2)), Some(1));
+        assert_eq!(sim.flush(reg_gpr(0..1)), Some(0));
+
+        // Interesting CFG case that the old pass got wrong.
+        // CFG: A -> [B, C] -> D
+        // A pushes
+        assert_eq!(sim.push(reg_gpr(0..4)), None);
+        // B: pushes a tex then flushes it
+        let mut b_sim = sim.clone();
+        assert_eq!(b_sim.push(reg_gpr(4..8)), None);
+        assert_eq!(b_sim.flush(reg_gpr(4..8)), Some(0));
+        // C: pushes 3 tex and never flushes them
+        let mut c_sim = sim.clone();
+        assert_eq!(c_sim.push(reg_gpr(4..5)), None);
+        assert_eq!(c_sim.push(reg_gpr(5..6)), None);
+        assert_eq!(c_sim.push(reg_gpr(6..7)), None);
+        // D: flushes the tex pushed by A
+        // The check must be made on the merged state, not on C alone.
+        let mut d_sim = b_sim;
+        d_sim.merge(&c_sim);
+        assert_eq!(d_sim.flush(reg_gpr(0..4)), Some(3));
+        // the "shortest push path" would pass by B but in fact
+        // by passing in B our texture is flushed off the queue.
+        // (old algorithm would insert a texdepbar 1)
+    }
+
+    #[test]
+    fn test_texdepbar_overflow() {
+        let mut sim = TexQueueSimulationState::new();
+
+        // Fill the texture queue (MAX_TEXTURES_LEFT + 1 entries)
+        for i in 0..(usize::from(OpTexDepBar::MAX_TEXTURES_LEFT) + 1) {
+            assert_eq!(sim.push(reg_gpr(i..(i + 1))), None);
+        }
+        // The new push would overflow the queue, we NEED a barrier
+        assert_eq!(
+            sim.push(reg_gpr(64..65)),
+            Some(OpTexDepBar::MAX_TEXTURES_LEFT)
+        );
+        // The barrier only frees one slot, so the queue is full again
+        assert_eq!(
+            sim.push(reg_gpr(65..66)),
+            Some(OpTexDepBar::MAX_TEXTURES_LEFT)
+        );
+    }
+}
--- a/src/nouveau/compiler/nak/from_nir.rs
+++ b/src/nouveau/compiler/nak/from_nir.rs
@ -1994,12 +1994,6 @@ impl<'a> ShaderFromNir<'a> {
            }
        }

-        if self.sm.sm() < 50 {
-            // TODO: texbar should be created by calc_instr_deps() and
-            // should be less conservative than textures_left=0.
-            // See the old pass: NVC0LegalizePostRA::insertTextureBarriers
-            b.push_op(OpTexDepBar { textures_left: 0 });
-        }
        self.set_ssa(tex.def.as_def(), nir_dst);
    }

--- a/src/nouveau/compiler/nak/ir.rs
+++ b/src/nouveau/compiler/nak/ir.rs
@ -6945,7 +6945,15 @@ impl_display_for_op!(OpBar);
 #[repr(C)]
 #[derive(SrcsAsSlice, DstsAsSlice)]
 pub struct OpTexDepBar {
-    pub textures_left: i8,
+    pub textures_left: u8,
+}
+
+impl OpTexDepBar {
+    /// Maximum value of textures_left
+    ///
+    /// The maximum encodable value is 63.  However, nvcc starts emitting
+    /// TEXDEPBAR 0x3e as soon as it hits 62 texture instructions.
+    /// Since 0x3e == 62, this constant follows the nvcc behavior rather
+    /// than the encoding limit.
+    pub const MAX_TEXTURES_LEFT: u8 = 62;
 }

 impl DisplayOp for OpTexDepBar {