From f2e6bacafd4b5c2c7214f633f0b77acfcb64bde6 Mon Sep 17 00:00:00 2001 From: Lorenzo Rossi Date: Tue, 15 Jul 2025 21:57:00 +0200 Subject: [PATCH] nak/kepler: Add texdepbar insertion pass This commit adds a forward data-flow pass to insert texdepbar before using registers of texture fetch instructions. The new algorithm started as a port of the old codegen pass, but finished in a complete rewrite that is substantially simpler and should generate less conservative code in some edge cases. Signed-off-by: Lorenzo Rossi Part-of: --- src/nouveau/compiler/nak/calc_instr_deps.rs | 425 +++++++++++++++++++- src/nouveau/compiler/nak/from_nir.rs | 6 - src/nouveau/compiler/nak/ir.rs | 10 +- 3 files changed, 433 insertions(+), 8 deletions(-) diff --git a/src/nouveau/compiler/nak/calc_instr_deps.rs b/src/nouveau/compiler/nak/calc_instr_deps.rs index 647e37668fa..d5cd3494763 100644 --- a/src/nouveau/compiler/nak/calc_instr_deps.rs +++ b/src/nouveau/compiler/nak/calc_instr_deps.rs @@ -2,12 +2,14 @@ // SPDX-License-Identifier: MIT use crate::api::{GetDebugFlags, DEBUG}; +use crate::dataflow::ForwardDataflow; use crate::ir::*; use crate::reg_tracker::RegTracker; use rustc_hash::{FxHashMap, FxHashSet}; use std::cmp::max; -use std::slice; +use std::ops::Range; +use std::{slice, u32, u8}; #[derive(Clone)] enum RegUse { @@ -229,6 +231,342 @@ impl BarAlloc { } } +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +struct TexQueueSimulationEntry { + min_pos: u8, +} + +impl TexQueueSimulationEntry { + const INVALID: Self = TexQueueSimulationEntry { min_pos: u8::MAX }; + + // First element on the queue + const FIRST: Self = TexQueueSimulationEntry { min_pos: 0 }; + + fn is_valid(&self) -> bool { + if *self == Self::INVALID { + false + } else { + debug_assert!(self.min_pos <= OpTexDepBar::MAX_TEXTURES_LEFT); + true + } + } + + fn push(&mut self) { + if self.is_valid() { + self.min_pos += 1; + } + } + + fn flush_after(&mut self, pos: u8) -> bool { + if self.min_pos < pos { + true + } else { + // 
This entry is either invalid or higher than the cull level + *self = Self::INVALID; + false + } + } + + fn merge(&mut self, other: &Self) { + self.min_pos = self.min_pos.min(other.min_pos); + } +} + +/// Simulate the state of a register in the queue, in buckets of 4 +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +struct TexQueueSimulationBucket { + entries: [TexQueueSimulationEntry; 4], +} + +impl TexQueueSimulationBucket { + const EMPTY: Self = TexQueueSimulationBucket { + entries: [TexQueueSimulationEntry::INVALID; 4], + }; + + fn min_queue_position(&self, range: Range) -> Option { + self.entries[range] + .iter() + .filter(|x| x.is_valid()) + .map(|x| x.min_pos) + .min() + } + + fn set_as_first(&mut self, range: Range) { + for i in range { + debug_assert!(!self.entries[i].is_valid()); + self.entries[i] = TexQueueSimulationEntry::FIRST; + } + } + + fn push(&mut self) { + for entry in &mut self.entries { + entry.push(); + } + } + + fn flush_after(&mut self, pos: u8) -> bool { + debug_assert!(pos <= OpTexDepBar::MAX_TEXTURES_LEFT); + + let mut retain = false; + for x in &mut self.entries { + retain |= x.flush_after(pos); + } + retain + } + + fn merge(&mut self, other: &Self) { + for (x, y) in self.entries.iter_mut().zip(other.entries.iter()) { + x.merge(y); + } + } +} + +/// This state simulates the texture queue for each destination. +/// +/// For example, at the start the queue is always empty, but if we encounter a +/// tex operation that writes in r4..r8, that is pushed on the queue at +/// position 0. If we encounter another tex operation that only writes r5, +/// that will be pushed at position 0 and the old tex instruction will be in +/// position 1. This data-structure keeps track of the position of the queue +/// for each destination register present in the queue, push operations +/// correspond to new texture instructions, while flush operations correspond to +/// the usage of registers which may still be on the queue. 
+/// +/// Since all Kepler texture operations use at most 4 registers, and many +/// instructions use more than one destination at a time, we group registers in +/// buckets of 4. With this optimization each RegRef only accesses a single +/// bucket. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TexQueueSimulationState { + /// Min position of the destination register in the queue, + /// in buckets of 4 (indexed by register_index / 4). + queue_pos: FxHashMap, + /// Max length of the queue, needed to check for overflows + max_queue_len: u8, +} + +impl TexQueueSimulationState { + pub fn new() -> Self { + TexQueueSimulationState { + queue_pos: Default::default(), + max_queue_len: 0, + } + } + + /// Translate from RegRef to bucket_index + bucket_range + #[inline] + fn reg_ref_to_coords(reg: RegRef) -> (u8, Range) { + debug_assert!(reg.base_idx() <= u8::MAX.into()); + let idx = reg.base_idx() as u8 / 4; + let sub = (reg.base_idx() % 4) as usize; + + let range = sub..(sub + reg.comps() as usize); + assert!(range.end <= 4); + (idx, range) + } + + fn min_queue_position(&self, reg: RegRef) -> Option { + let (idx, range) = Self::reg_ref_to_coords(reg); + + self.queue_pos + .get(&idx) + .and_then(|x| x.min_queue_position(range)) + } + + fn is_queue_full(&self) -> bool { + // MAX_TEXTURES_LEFT describes the maximum number encodable + // in the texdepbar, but the queue must have an element more. 
+ self.max_queue_len > OpTexDepBar::MAX_TEXTURES_LEFT + } + + /// Flush every element whose position >= pos + /// + /// Effectively simulates the execution of a `texdepbar pos` + fn flush_after(&mut self, pos: u8) { + self.max_queue_len = self.max_queue_len.min(pos); + self.queue_pos.retain(|_, v| v.flush_after(pos)); + } + + pub fn push(&mut self, reg: RegRef) -> Option { + // Assert we are not on the queue + debug_assert!(self.min_queue_position(reg).is_none()); + + // Check that the push operation does not overflow the queue, + // if it does, we must insert a barrier + let mut tex_bar = None; + if self.is_queue_full() { + // The queue is full (more than MAX_TEXTURES_LEFT in-flight tex-ops), + // make space by removing 1 texture. + tex_bar = Some(OpTexDepBar::MAX_TEXTURES_LEFT); + self.flush_after(OpTexDepBar::MAX_TEXTURES_LEFT); + // Now the queue is not full anymore + debug_assert!(!self.is_queue_full()); + } + + self.max_queue_len += 1; + // Every entry is pushed by 1 + for x in self.queue_pos.values_mut() { + x.push(); + } + + // Put us on the queue as first + let (idx, range) = Self::reg_ref_to_coords(reg); + self.queue_pos + .entry(idx) + .or_insert(TexQueueSimulationBucket::EMPTY) + .set_as_first(range); + + tex_bar + } + + pub fn flush(&mut self, reg: RegRef) -> Option { + let queue_pos = self.min_queue_position(reg); + + let Some(queue_pos) = queue_pos else { + return None; // Not in queue + }; + + // Cut the queue + self.flush_after(queue_pos); + debug_assert!(self.min_queue_position(reg).is_none()); + + Some(queue_pos) + } + + pub fn merge(&mut self, other: &Self) { + self.max_queue_len = self.max_queue_len.max(other.max_queue_len); + for (key, y) in other.queue_pos.iter() { + let x = self + .queue_pos + .entry(*key) + .or_insert(TexQueueSimulationBucket::EMPTY); + x.merge(y); + } + } + + /// Simulates the execution of an instruction and returns the + /// barrier level needed. 
+ pub fn visit_instr(&mut self, instr: &Instr) -> Option { + // Flush register reads and writes + // (avoid write-after-write and read-after-write hazards) + // Compute the minimum required flush level (for barriers) + let flush_level = if !self.queue_pos.is_empty() { + let src_refs = + instr.srcs().iter().filter_map(|x| x.src_ref.as_reg()); + let dst_refs = instr.dsts().iter().filter_map(|x| x.as_reg()); + + src_refs + .chain(dst_refs) + .filter_map(|reg_ref| self.flush(*reg_ref)) + .reduce(|a, b| a.min(b)) + } else { + // The queue is empty, no need to check the instruction + None + }; + + // Push registers (if we are a tex instruction) + // We might need to insert a barrier if the queue is full + let push_level = if instr_needs_texbar(&instr) { + let dst = instr.dsts()[0].as_reg().unwrap(); + self.push(*dst) + } else { + None + }; + + // If the flush needs a barrier, the queue will not be full, + // therefore the push will not need a barrier. + debug_assert!(!flush_level.is_some() || !push_level.is_some()); + flush_level.or(push_level) + } +} + +fn instr_needs_texbar(instr: &Instr) -> bool { + matches!( + instr.op, + Op::Tex(_) + | Op::Tld(_) + | Op::Tmml(_) + | Op::Tld4(_) + | Op::Txd(_) + | Op::Txq(_) + ) +} + +/// Hardware has a FIFO queue of textures that are still fetching, +/// when the oldest tex finishes executing, it's written to the reg, +/// removed from the queue and it begins executing the new one. +/// The problem arises when a texture is read while it is still being fetched; +/// to avoid it, we have a `texdepbar {i}` instruction that stalls until +/// the texture fetch queue has at most {i} elements. +/// E.g. the simplest solution is to have texdepbar 0 after each texture +/// instruction, but this would stall the pipeline until the texture fetch +/// finishes executing. +/// This algorithm inserts `texdepbar` at each use of the texture results, +/// simulating the texture queue execution. 
+/// +/// Note that the texture queue has for each entry (texture data, register output) +/// and each register can be on the queue only once (we don't want to have multiple texture +/// operations in-flight that write to the same registers). +/// This can lead to a neat algorithm: +/// instead of tracking the queue directly, which can exponentially explode in complexity, +/// track the position of each register, which needs at most 255/63 positions. +/// For branches the state is duplicated in each basic block, +/// for joins instead we want to keep both the minimum position of each +/// entry and the maximum length of the queue to avoid overflows. +/// +/// TODO: If this pass is too slow, there are still optimizations left: +/// - Our data-flow computes barrier levels and discards them, +/// but since most CFG blocks do not need recomputation, we could save +/// the barrier levels in a vec and save a pass later. +/// - Instead of pushing by 1 each element in the queue on a `push` op, +/// we could keep track of an in-flight range and use a wrapping timestamp +/// this improves performance but needs careful implementation to avoid bugs +fn insert_texture_barriers(f: &mut Function, sm: &dyn ShaderModel) { + assert!(sm.is_kepler()); // Only kepler has texture barriers! 
+ + let mut state_in: Vec<_> = (0..f.blocks.len()) + .map(|_| TexQueueSimulationState::new()) + .collect(); + let mut state_out: Vec<_> = (0..f.blocks.len()) + .map(|_| TexQueueSimulationState::new()) + .collect(); + ForwardDataflow { + cfg: &f.blocks, + block_in: &mut state_in[..], + block_out: &mut state_out[..], + transfer: |_block_idx, block, sim_out, sim_in| { + let mut sim = sim_in.clone(); + + for instr in block.instrs.iter() { + // Ignore the barrier, we will recompute this later + let _bar = sim.visit_instr(&instr); + } + + if *sim_out == sim { + false + } else { + *sim_out = sim; + true + } + }, + join: |sim_out, pred_sim_in| { + sim_out.merge(pred_sim_in); + }, + } + .solve(); + + for (block, mut sim) in f.blocks.iter_mut().zip(state_in.into_iter()) { + block.map_instrs(|instr| { + if let Some(textures_left) = sim.visit_instr(&instr) { + let bar = Instr::new_boxed(OpTexDepBar { textures_left }); + MappedInstrs::Many(vec![bar, instr]) + } else { + MappedInstrs::One(instr) + } + }); + } +} + fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) { let mut uses = Box::new(RegTracker::new_with(&|| RegUse::None)); let mut deps = DepGraph::new(); @@ -502,6 +840,12 @@ impl Shader<'_> { } pub fn calc_instr_deps(&mut self) { + if self.sm.is_kepler() { + for f in &mut self.functions { + insert_texture_barriers(f, self.sm); + } + } + if DEBUG.serial() { self.assign_deps_serial(); } else { @@ -524,3 +868,82 @@ impl Shader<'_> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn reg_gpr(range: Range) -> RegRef { + RegRef::new( + RegFile::GPR, + range.start as u32, + (range.end - range.start) as u8, + ) + } + + #[test] + fn test_texdepbar_basic() { + let mut sim = TexQueueSimulationState::new(); + + // RaW + assert_eq!(sim.push(reg_gpr(0..4)), None); + assert_eq!(sim.flush(reg_gpr(2..3)), Some(0)); + + // 2 entries in the queue + assert_eq!(sim.push(reg_gpr(0..2)), None); // [A] + assert_eq!(sim.push(reg_gpr(2..4)), None); // [B, A] + 
assert_eq!(sim.flush(reg_gpr(0..1)), Some(1)); // [B] + assert_eq!(sim.flush(reg_gpr(3..4)), Some(0)); // [] + + // Test bucket conflicts + assert_eq!(sim.push(reg_gpr(0..1)), None); + assert_eq!(sim.flush(reg_gpr(1..3)), None); + assert_eq!(sim.flush(reg_gpr(0..3)), Some(0)); + + // Bucket conflict part 2: Electric Boogaloo + assert_eq!(sim.push(reg_gpr(1..2)), None); + assert_eq!(sim.push(reg_gpr(0..1)), None); + assert_eq!(sim.flush(reg_gpr(1..2)), Some(1)); + assert_eq!(sim.flush(reg_gpr(0..1)), Some(0)); + + // Interesting CFG case that the old pass got wrong. + // CFG: A -> [B, C] -> D + // A pushes + assert_eq!(sim.push(reg_gpr(0..4)), None); + // B: pushes a tex then flushes it + let mut b_sim = sim.clone(); + assert_eq!(b_sim.push(reg_gpr(4..8)), None); + assert_eq!(b_sim.flush(reg_gpr(4..8)), Some(0)); + // C: pushes 3 tex and never flushes them + let mut c_sim = sim.clone(); + assert_eq!(c_sim.push(reg_gpr(4..5)), None); + assert_eq!(c_sim.push(reg_gpr(5..6)), None); + assert_eq!(c_sim.push(reg_gpr(6..7)), None); + // D: flushes the tex pushed by A + let mut d_sim = b_sim; + d_sim.merge(&mut c_sim); + assert_eq!(c_sim.flush(reg_gpr(0..4)), Some(3)); + // the "shortest push path" would pass by B but in fact + // by passing in B our texture is flushed off the queue. 
+ // (old algorithm would insert a texdepbar 1) + } + + #[test] + fn test_texdepbar_overflow() { + let mut sim = TexQueueSimulationState::new(); + + // Fill the texture queue + for i in 0..(usize::from(OpTexDepBar::MAX_TEXTURES_LEFT) + 1) { + assert_eq!(sim.push(reg_gpr(i..(i + 1))), None); + } + // The new push would overflow the queue, we NEED a barrier + assert_eq!( + sim.push(reg_gpr(64..65)), + Some(OpTexDepBar::MAX_TEXTURES_LEFT) + ); + assert_eq!( + sim.push(reg_gpr(65..66)), + Some(OpTexDepBar::MAX_TEXTURES_LEFT) + ); + } +} diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index a699926a6a3..e5946a3b874 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -1994,12 +1994,6 @@ impl<'a> ShaderFromNir<'a> { } } - if self.sm.sm() < 50 { - // TODO: texbar should be created by calc_instr_deps() and - // should be less conservative than textures_left=0. - // See the old pass: NVC0LegalizePostRA::insertTextureBarriers - b.push_op(OpTexDepBar { textures_left: 0 }); - } self.set_ssa(tex.def.as_def(), nir_dst); } diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 49a156a24e1..4fc1fce99ec 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -6945,7 +6945,15 @@ impl_display_for_op!(OpBar); #[repr(C)] #[derive(SrcsAsSlice, DstsAsSlice)] pub struct OpTexDepBar { - pub textures_left: i8, + pub textures_left: u8, +} + +impl OpTexDepBar { + /// Maximum value of textures_left + /// + /// The maximum encodable value is 63. However, nvcc starts emitting + /// TEXDEPBAR 0x3e as soon as it hits 62 texture instructions. + pub const MAX_TEXTURES_LEFT: u8 = 62; } impl DisplayOp for OpTexDepBar {