nak/kepler: Add texdepbar insertion pass

This commit adds a forward data-flow pass that inserts texdepbar
instructions before uses of registers written by texture fetch instructions.
The new algorithm started as a port of the old codegen pass, but ended up
as a complete rewrite that is substantially simpler and should generate
less conservative code in some edge cases.

Signed-off-by: Lorenzo Rossi <snowycoder@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35403>
This commit is contained in:
Lorenzo Rossi 2025-07-15 21:57:00 +02:00 committed by Marge Bot
parent fbeb70cbbc
commit f2e6bacafd
3 changed files with 433 additions and 8 deletions

View file

@ -2,12 +2,14 @@
// SPDX-License-Identifier: MIT
use crate::api::{GetDebugFlags, DEBUG};
use crate::dataflow::ForwardDataflow;
use crate::ir::*;
use crate::reg_tracker::RegTracker;
use rustc_hash::{FxHashMap, FxHashSet};
use std::cmp::max;
use std::slice;
use std::ops::Range;
use std::{slice, u32, u8};
#[derive(Clone)]
enum RegUse<T: Clone> {
@ -229,6 +231,342 @@ impl BarAlloc {
}
}
/// Simulated queue position of a single destination register.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct TexQueueSimulationEntry {
    /// Lowest position the register may occupy in the texture queue.
    /// `u8::MAX` is reserved for "not on the queue" (see `INVALID`).
    min_pos: u8,
}

impl TexQueueSimulationEntry {
    /// Sentinel for a register that is not on the queue.
    const INVALID: Self = TexQueueSimulationEntry { min_pos: u8::MAX };

    /// Entry sitting at the front of the queue (position 0).
    const FIRST: Self = TexQueueSimulationEntry { min_pos: 0 };

    /// Returns `true` unless this entry is the `INVALID` sentinel.
    fn is_valid(&self) -> bool {
        let valid = *self != Self::INVALID;
        // A valid position is expected to stay within MAX_TEXTURES_LEFT.
        debug_assert!(
            !valid || self.min_pos <= OpTexDepBar::MAX_TEXTURES_LEFT
        );
        valid
    }

    /// Moves a valid entry one position further back in the queue;
    /// invalid entries are left untouched.
    fn push(&mut self) {
        if !self.is_valid() {
            return;
        }
        self.min_pos += 1;
    }

    /// Keeps the entry if its position is strictly below `pos`, otherwise
    /// invalidates it.  Returns whether the entry was kept.
    fn flush_after(&mut self, pos: u8) -> bool {
        let keep = self.min_pos < pos;
        if !keep {
            // Either already invalid or at/behind the flush level.
            *self = Self::INVALID;
        }
        keep
    }

    /// Takes the smaller of the two positions.  Since `INVALID` holds the
    /// largest `u8`, a valid position always wins over an invalid one.
    fn merge(&mut self, other: &Self) {
        if other.min_pos < self.min_pos {
            self.min_pos = other.min_pos;
        }
    }
}
/// Queue state for a group of four consecutive registers.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct TexQueueSimulationBucket {
    /// One entry per register slot of the bucket.
    entries: [TexQueueSimulationEntry; 4],
}

impl TexQueueSimulationBucket {
    /// Bucket in which no register is on the queue.
    const EMPTY: Self = TexQueueSimulationBucket {
        entries: [TexQueueSimulationEntry::INVALID; 4],
    };

    /// Smallest queue position among the valid entries in `range`, or
    /// `None` when none of those slots is on the queue.
    fn min_queue_position(&self, range: Range<usize>) -> Option<u8> {
        self.entries[range]
            .iter()
            .filter_map(|entry| entry.is_valid().then_some(entry.min_pos))
            .min()
    }

    /// Puts every slot in `range` at the front of the queue.
    /// The slots are expected not to be on the queue already.
    fn set_as_first(&mut self, range: Range<usize>) {
        range.for_each(|slot| {
            debug_assert!(!self.entries[slot].is_valid());
            self.entries[slot] = TexQueueSimulationEntry::FIRST;
        });
    }

    /// Moves every entry one position further back.
    fn push(&mut self) {
        self.entries.iter_mut().for_each(|entry| entry.push());
    }

    /// Applies `flush_after(pos)` to every entry and returns `true` when at
    /// least one entry is still on the queue afterwards.
    fn flush_after(&mut self, pos: u8) -> bool {
        debug_assert!(pos <= OpTexDepBar::MAX_TEXTURES_LEFT);
        // Non-short-circuiting `|`: every entry must be visited.
        self.entries
            .iter_mut()
            .fold(false, |kept, entry| entry.flush_after(pos) | kept)
    }

    /// Merges `other` into `self` slot by slot.
    fn merge(&mut self, other: &Self) {
        for (mine, theirs) in self.entries.iter_mut().zip(&other.entries) {
            mine.merge(theirs);
        }
    }
}
/// Simulated texture queue, tracked per destination register.
///
/// The queue starts out empty.  When a tex operation writing r4..r8 is
/// encountered, those registers enter the queue at position 0.  If another
/// tex operation writing only r5 follows, it takes position 0 and the older
/// operation moves to position 1.  Rather than storing the queue itself,
/// this structure records the queue position of every destination register
/// that is still on it: a push corresponds to a new texture instruction and
/// a flush corresponds to the use of a register that may still be queued.
///
/// Kepler texture operations write at most 4 registers and frequently more
/// than one at a time, so registers are grouped in buckets of 4.  Thanks to
/// this grouping each RegRef only ever touches a single bucket.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TexQueueSimulationState {
    /// Per-bucket minimum queue position of each destination register.
    /// The key is `register_index / 4`.
    queue_pos: FxHashMap<u8, TexQueueSimulationBucket>,
    /// Upper bound on the queue length, used to detect overflows.
    max_queue_len: u8,
}
impl TexQueueSimulationState {
    /// Creates a state with nothing on the queue.
    pub fn new() -> Self {
        Self {
            queue_pos: Default::default(),
            max_queue_len: 0,
        }
    }

    /// Splits a RegRef into its bucket key (`base_idx / 4`) and the range of
    /// slots it covers inside that bucket.
    ///
    /// Panics if the reference would extend past the end of its bucket.
    ///
    /// NOTE(review): only `base_idx()` and `comps()` are consulted, the
    /// register file is not.  Presumably references from different files
    /// with the same index end up sharing a slot; confirm this is intended.
    #[inline]
    fn reg_ref_to_coords(reg: RegRef) -> (u8, Range<usize>) {
        let base = reg.base_idx();
        debug_assert!(base <= u8::MAX.into());
        let bucket = base as u8 / 4;
        let first = (base % 4) as usize;
        let slots = first..(first + reg.comps() as usize);
        assert!(slots.end <= 4);
        (bucket, slots)
    }

    /// Lowest queue position of any component of `reg`, or `None` when no
    /// component is on the queue.
    fn min_queue_position(&self, reg: RegRef) -> Option<u8> {
        let (bucket, slots) = Self::reg_ref_to_coords(reg);
        self.queue_pos.get(&bucket)?.min_queue_position(slots)
    }

    /// Whether another push would overflow the queue.
    fn is_queue_full(&self) -> bool {
        // MAX_TEXTURES_LEFT is the largest value a texdepbar can carry, but
        // the queue itself holds one element more than that.
        OpTexDepBar::MAX_TEXTURES_LEFT < self.max_queue_len
    }

    /// Drops every element whose position is >= `pos`.
    ///
    /// This mirrors the effect of executing a `texdepbar pos`.
    fn flush_after(&mut self, pos: u8) {
        if pos < self.max_queue_len {
            self.max_queue_len = pos;
        }
        self.queue_pos.retain(|_, bucket| bucket.flush_after(pos));
    }

    /// Puts `reg` at the front of the queue, shifting everything else back
    /// by one.
    ///
    /// Returns the barrier level that has to be emitted first when the queue
    /// was full, `None` otherwise.
    pub fn push(&mut self, reg: RegRef) -> Option<u8> {
        // The register must not already be on the queue.
        debug_assert!(self.min_queue_position(reg).is_none());

        // A push onto a full queue needs a barrier that retires the oldest
        // element before the new one can be queued.
        let tex_bar = if self.is_queue_full() {
            let level = OpTexDepBar::MAX_TEXTURES_LEFT;
            self.flush_after(level);
            debug_assert!(!self.is_queue_full());
            Some(level)
        } else {
            None
        };

        self.max_queue_len += 1;

        // Everything already queued moves back by one position...
        self.queue_pos.values_mut().for_each(|bucket| bucket.push());

        // ...and the new register takes the front.
        let (bucket, slots) = Self::reg_ref_to_coords(reg);
        self.queue_pos
            .entry(bucket)
            .or_insert(TexQueueSimulationBucket::EMPTY)
            .set_as_first(slots);

        tex_bar
    }

    /// Removes `reg`, and everything queued at or behind it, from the queue.
    ///
    /// Returns the queue position `reg` had, which is the barrier level
    /// needed, or `None` if `reg` was not on the queue.
    pub fn flush(&mut self, reg: RegRef) -> Option<u8> {
        let level = self.min_queue_position(reg)?;
        self.flush_after(level);
        debug_assert!(self.min_queue_position(reg).is_none());
        Some(level)
    }

    /// Combines `other` into `self`: the larger queue length and, for every
    /// register, the smaller position are kept.
    pub fn merge(&mut self, other: &Self) {
        if other.max_queue_len > self.max_queue_len {
            self.max_queue_len = other.max_queue_len;
        }
        for (&key, theirs) in &other.queue_pos {
            self.queue_pos
                .entry(key)
                .or_insert(TexQueueSimulationBucket::EMPTY)
                .merge(theirs);
        }
    }

    /// Simulates the execution of `instr` and returns the barrier level
    /// that must be emitted in front of it, if any.
    pub fn visit_instr(&mut self, instr: &Instr) -> Option<u8> {
        // Flush every register the instruction reads or writes so that
        // read-after-write and write-after-write hazards are avoided, and
        // keep the lowest level any of those flushes asked for.
        let flush_level = if self.queue_pos.is_empty() {
            // Nothing is queued, so there is nothing to look for.
            None
        } else {
            let reads =
                instr.srcs().iter().filter_map(|src| src.src_ref.as_reg());
            let writes = instr.dsts().iter().filter_map(|dst| dst.as_reg());
            reads
                .chain(writes)
                .filter_map(|reg| self.flush(*reg))
                .min()
        };

        // Texture instructions push their destination on the queue, which
        // may itself require a barrier when the queue is full.
        let push_level = if instr_needs_texbar(instr) {
            let dst = instr.dsts()[0].as_reg().unwrap();
            self.push(*dst)
        } else {
            None
        };

        // A flush that needed a barrier leaves the queue non-full, so the
        // push cannot ask for a second one.
        debug_assert!(flush_level.is_none() || push_level.is_none());
        flush_level.or(push_level)
    }
}
/// Returns `true` when `instr` is one of the texture operations
/// `Tex`, `Tld`, `Tmml`, `Tld4`, `Txd` or `Txq`.
fn instr_needs_texbar(instr: &Instr) -> bool {
    let op = &instr.op;
    matches!(
        op,
        Op::Tex(_)
            | Op::Tld(_)
            | Op::Tld4(_)
            | Op::Tmml(_)
            | Op::Txd(_)
            | Op::Txq(_)
    )
}
/// Inserts `texdepbar` instructions in front of the instructions that need
/// one according to a simulation of the texture queue.
///
/// Hardware keeps a FIFO queue of textures that are still being fetched.
/// When the oldest fetch finishes, its result is written to the register,
/// the entry leaves the queue and the next fetch starts executing.
/// Reading a texture result while it is still being fetched is a problem;
/// to avoid it there is a `texdepbar {i}` instruction that stalls until the
/// texture fetch queue holds at most {i} elements.
/// The simplest solution would be a `texdepbar 0` after every texture
/// instruction, but that stalls the pipeline until the fetch has finished.
/// This algorithm instead inserts `texdepbar` at each use of a texture
/// result, simulating the execution of the texture queue.
///
/// Each queue entry holds (texture data, register output) and each register
/// can be on the queue only once (we do not want several in-flight texture
/// operations writing the same registers).
/// That allows a neat algorithm: instead of tracking the queue directly,
/// which can explode exponentially in complexity, track the position of
/// each register, which needs at most 255/63 positions.
/// For branches the state is duplicated in each basic block; for joins we
/// keep both the minimum position of each entry and the maximum length of
/// the queue, to avoid overflows.
///
/// TODO: If this pass is too slow, there are still optimizations left:
/// - The data-flow computes barrier levels and discards them, but since
///   most CFG blocks do not need recomputation, the levels could be saved
///   in a vec to spare the second pass.
/// - Instead of moving every queued element by 1 on each `push`, we could
///   track an in-flight range with a wrapping timestamp; this improves
///   performance but needs a careful implementation to avoid bugs.
fn insert_texture_barriers(f: &mut Function, sm: &dyn ShaderModel) {
    // Only kepler has texture barriers!
    assert!(sm.is_kepler());

    let block_count = f.blocks.len();
    let mut state_in = vec![TexQueueSimulationState::new(); block_count];
    let mut state_out = vec![TexQueueSimulationState::new(); block_count];

    // First pass: iterate the queue simulation to a fixed point.
    ForwardDataflow {
        cfg: &f.blocks,
        block_in: state_in.as_mut_slice(),
        block_out: state_out.as_mut_slice(),
        transfer: |_block_idx, block, sim_out, sim_in| {
            let mut sim = sim_in.clone();
            block.instrs.iter().for_each(|instr| {
                // The barrier level is recomputed in the second pass.
                let _ = sim.visit_instr(instr);
            });

            // Report whether this block's result changed.
            let changed = *sim_out != sim;
            if changed {
                *sim_out = sim;
            }
            changed
        },
        join: |dst, src| dst.merge(src),
    }
    .solve();

    // Second pass: replay every block from its entry state and emit the
    // barriers the simulation asks for.
    for (block, mut sim) in f.blocks.iter_mut().zip(state_in) {
        block.map_instrs(|instr| match sim.visit_instr(&instr) {
            Some(textures_left) => {
                let bar = Instr::new_boxed(OpTexDepBar { textures_left });
                MappedInstrs::Many(vec![bar, instr])
            }
            None => MappedInstrs::One(instr),
        });
    }
}
fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
let mut uses = Box::new(RegTracker::new_with(&|| RegUse::None));
let mut deps = DepGraph::new();
@ -502,6 +840,12 @@ impl Shader<'_> {
}
pub fn calc_instr_deps(&mut self) {
if self.sm.is_kepler() {
for f in &mut self.functions {
insert_texture_barriers(f, self.sm);
}
}
if DEBUG.serial() {
self.assign_deps_serial();
} else {
@ -524,3 +868,82 @@ impl Shader<'_> {
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a GPR reference covering the register indices in `range`.
    fn reg_gpr(range: Range<usize>) -> RegRef {
        RegRef::new(
            RegFile::GPR,
            range.start as u32,
            (range.end - range.start) as u8,
        )
    }

    /// Exercises push/flush on a single path and the merge of two paths.
    #[test]
    fn test_texdepbar_basic() {
        let mut sim = TexQueueSimulationState::new();

        // RaW
        assert_eq!(sim.push(reg_gpr(0..4)), None);
        assert_eq!(sim.flush(reg_gpr(2..3)), Some(0));

        // 2 entries in the queue
        assert_eq!(sim.push(reg_gpr(0..2)), None); // [A]
        assert_eq!(sim.push(reg_gpr(2..4)), None); // [B, A]
        assert_eq!(sim.flush(reg_gpr(0..1)), Some(1)); // [B]
        assert_eq!(sim.flush(reg_gpr(3..4)), Some(0)); // []

        // Test bucket conflicts
        assert_eq!(sim.push(reg_gpr(0..1)), None);
        assert_eq!(sim.flush(reg_gpr(1..3)), None);
        assert_eq!(sim.flush(reg_gpr(0..3)), Some(0));

        // Bucket conflict part 2: Electric Boogaloo
        assert_eq!(sim.push(reg_gpr(1..2)), None);
        assert_eq!(sim.push(reg_gpr(0..1)), None);
        assert_eq!(sim.flush(reg_gpr(1..2)), Some(1));
        assert_eq!(sim.flush(reg_gpr(0..1)), Some(0));

        // Interesting CFG case that the old pass got wrong.
        // CFG: A -> [B, C] -> D
        // A pushes
        assert_eq!(sim.push(reg_gpr(0..4)), None);

        // B: pushes a tex then flushes it
        let mut b_sim = sim.clone();
        assert_eq!(b_sim.push(reg_gpr(4..8)), None);
        assert_eq!(b_sim.flush(reg_gpr(4..8)), Some(0));

        // C: pushes 3 tex and never flushes them
        let mut c_sim = sim.clone();
        assert_eq!(c_sim.push(reg_gpr(4..5)), None);
        assert_eq!(c_sim.push(reg_gpr(5..6)), None);
        assert_eq!(c_sim.push(reg_gpr(6..7)), None);

        // D: flushes the tex pushed by A.
        // The assertion must look at the merged state of B and C; checking
        // `c_sim` here would leave `merge` untested.
        let mut d_sim = b_sim;
        d_sim.merge(&c_sim);
        assert_eq!(d_sim.flush(reg_gpr(0..4)), Some(3));
        // the "shortest push path" would pass by B but in fact
        // by passing in B our texture is flushed off the queue.
        // (old algorithm would insert a texdepbar 1)
    }

    /// Pushing onto a full queue must request a barrier.
    #[test]
    fn test_texdepbar_overflow() {
        let mut sim = TexQueueSimulationState::new();

        // Fill the texture queue
        for i in 0..(usize::from(OpTexDepBar::MAX_TEXTURES_LEFT) + 1) {
            assert_eq!(sim.push(reg_gpr(i..(i + 1))), None);
        }

        // The new push would overflow the queue, we NEED a barrier
        assert_eq!(
            sim.push(reg_gpr(64..65)),
            Some(OpTexDepBar::MAX_TEXTURES_LEFT)
        );
        assert_eq!(
            sim.push(reg_gpr(65..66)),
            Some(OpTexDepBar::MAX_TEXTURES_LEFT)
        );
    }
}

View file

@ -1994,12 +1994,6 @@ impl<'a> ShaderFromNir<'a> {
}
}
if self.sm.sm() < 50 {
// TODO: texbar should be created by calc_instr_deps() and
// should be less conservative than textures_left=0.
// See the old pass: NVC0LegalizePostRA::insertTextureBarriers
b.push_op(OpTexDepBar { textures_left: 0 });
}
self.set_ssa(tex.def.as_def(), nir_dst);
}

View file

@ -6945,7 +6945,15 @@ impl_display_for_op!(OpBar);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpTexDepBar {
pub textures_left: i8,
pub textures_left: u8,
}
impl OpTexDepBar {
    /// Largest `textures_left` value this compiler uses.
    ///
    /// 63 is the largest value that can be encoded, but nvcc already starts
    /// emitting TEXDEPBAR 0x3e once it reaches 62 texture instructions, so
    /// the limit is set to 62 here.
    pub const MAX_TEXTURES_LEFT: u8 = 62;
}
impl DisplayOp for OpTexDepBar {