mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-06 04:08:48 +02:00
nak/kepler: Add texdepbar insertion pass
This commit adds a forward data-flow pass to insert texdepbar before using registers of texture fetch instructions. The new algorithm started as a port of the old codegen pass, but finished in a complete rewrite that is substantially simpler and should generate less conservative code in some edge cases. Signed-off-by: Lorenzo Rossi <snowycoder@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35403>
This commit is contained in:
parent
fbeb70cbbc
commit
f2e6bacafd
3 changed files with 433 additions and 8 deletions
|
|
@ -2,12 +2,14 @@
|
|||
// SPDX-License-Identifier: MIT
|
||||
|
||||
use crate::api::{GetDebugFlags, DEBUG};
|
||||
use crate::dataflow::ForwardDataflow;
|
||||
use crate::ir::*;
|
||||
use crate::reg_tracker::RegTracker;
|
||||
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
use std::cmp::max;
|
||||
use std::slice;
|
||||
use std::ops::Range;
|
||||
use std::{slice, u32, u8};
|
||||
|
||||
#[derive(Clone)]
|
||||
enum RegUse<T: Clone> {
|
||||
|
|
@ -229,6 +231,342 @@ impl BarAlloc {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
struct TexQueueSimulationEntry {
|
||||
min_pos: u8,
|
||||
}
|
||||
|
||||
impl TexQueueSimulationEntry {
|
||||
const INVALID: Self = TexQueueSimulationEntry { min_pos: u8::MAX };
|
||||
|
||||
// First element on the queue
|
||||
const FIRST: Self = TexQueueSimulationEntry { min_pos: 0 };
|
||||
|
||||
fn is_valid(&self) -> bool {
|
||||
if *self == Self::INVALID {
|
||||
false
|
||||
} else {
|
||||
debug_assert!(self.min_pos <= OpTexDepBar::MAX_TEXTURES_LEFT);
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self) {
|
||||
if self.is_valid() {
|
||||
self.min_pos += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn flush_after(&mut self, pos: u8) -> bool {
|
||||
if self.min_pos < pos {
|
||||
true
|
||||
} else {
|
||||
// This entry is either invalid or higher than the cull level
|
||||
*self = Self::INVALID;
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn merge(&mut self, other: &Self) {
|
||||
self.min_pos = self.min_pos.min(other.min_pos);
|
||||
}
|
||||
}
|
||||
|
||||
/// Simulate the state of a register in the queue, in buckets of 4
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
struct TexQueueSimulationBucket {
|
||||
entries: [TexQueueSimulationEntry; 4],
|
||||
}
|
||||
|
||||
impl TexQueueSimulationBucket {
|
||||
const EMPTY: Self = TexQueueSimulationBucket {
|
||||
entries: [TexQueueSimulationEntry::INVALID; 4],
|
||||
};
|
||||
|
||||
fn min_queue_position(&self, range: Range<usize>) -> Option<u8> {
|
||||
self.entries[range]
|
||||
.iter()
|
||||
.filter(|x| x.is_valid())
|
||||
.map(|x| x.min_pos)
|
||||
.min()
|
||||
}
|
||||
|
||||
fn set_as_first(&mut self, range: Range<usize>) {
|
||||
for i in range {
|
||||
debug_assert!(!self.entries[i].is_valid());
|
||||
self.entries[i] = TexQueueSimulationEntry::FIRST;
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self) {
|
||||
for entry in &mut self.entries {
|
||||
entry.push();
|
||||
}
|
||||
}
|
||||
|
||||
fn flush_after(&mut self, pos: u8) -> bool {
|
||||
debug_assert!(pos <= OpTexDepBar::MAX_TEXTURES_LEFT);
|
||||
|
||||
let mut retain = false;
|
||||
for x in &mut self.entries {
|
||||
retain |= x.flush_after(pos);
|
||||
}
|
||||
retain
|
||||
}
|
||||
|
||||
fn merge(&mut self, other: &Self) {
|
||||
for (x, y) in self.entries.iter_mut().zip(other.entries.iter()) {
|
||||
x.merge(y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This state simulates the texture queue for each destination.
|
||||
///
|
||||
/// For example, at the start the queue is always empty, but if we encounter a
|
||||
/// tex operation that writes in r4..r8, that is pushed on the queue at
|
||||
/// position 0. If we encounter another tex operation that only writes r5,
|
||||
/// that will be pushed at position 0 and the old tex instruction will be in
|
||||
/// position 1. This data-structure keeps track of the position of the queue
|
||||
/// for each destination register present in the queue, push operations
|
||||
/// correspond to new texture instructions, while flush operations correspond to
|
||||
/// the usage of registers which may still be on the queue.
|
||||
///
|
||||
/// Since all Kepler texture operations use at most 4 registers, and many
|
||||
/// instruction use more than one destination at a time, we group registers in
|
||||
/// buckets of 4. With this optimization each RegRef only accesses a single
|
||||
/// bucket.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TexQueueSimulationState {
|
||||
/// Min position of the destination register in the queue,
|
||||
/// in buckets of 4 (indexed by register_index / 4).
|
||||
queue_pos: FxHashMap<u8, TexQueueSimulationBucket>,
|
||||
/// Max length of the queue, needed to check for overflows
|
||||
max_queue_len: u8,
|
||||
}
|
||||
|
||||
impl TexQueueSimulationState {
|
||||
pub fn new() -> Self {
|
||||
TexQueueSimulationState {
|
||||
queue_pos: Default::default(),
|
||||
max_queue_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Translate from RegRef to bucket_index + bucket_range
|
||||
#[inline]
|
||||
fn reg_ref_to_coords(reg: RegRef) -> (u8, Range<usize>) {
|
||||
debug_assert!(reg.base_idx() <= u8::MAX.into());
|
||||
let idx = reg.base_idx() as u8 / 4;
|
||||
let sub = (reg.base_idx() % 4) as usize;
|
||||
|
||||
let range = sub..(sub + reg.comps() as usize);
|
||||
assert!(range.end <= 4);
|
||||
(idx, range)
|
||||
}
|
||||
|
||||
fn min_queue_position(&self, reg: RegRef) -> Option<u8> {
|
||||
let (idx, range) = Self::reg_ref_to_coords(reg);
|
||||
|
||||
self.queue_pos
|
||||
.get(&idx)
|
||||
.and_then(|x| x.min_queue_position(range))
|
||||
}
|
||||
|
||||
fn is_queue_full(&self) -> bool {
|
||||
// MAX_TEXTURES_LEFT describes the maximum number encodable
|
||||
// in the texdepbar, but the queue must have an element more.
|
||||
self.max_queue_len > OpTexDepBar::MAX_TEXTURES_LEFT
|
||||
}
|
||||
|
||||
/// Flush every element whose position >= pos
|
||||
///
|
||||
/// Effectively simulates the execution of a `texdepbar pos`
|
||||
fn flush_after(&mut self, pos: u8) {
|
||||
self.max_queue_len = self.max_queue_len.min(pos);
|
||||
self.queue_pos.retain(|_, v| v.flush_after(pos));
|
||||
}
|
||||
|
||||
pub fn push(&mut self, reg: RegRef) -> Option<u8> {
|
||||
// Assert we are not on the queue
|
||||
debug_assert!(self.min_queue_position(reg).is_none());
|
||||
|
||||
// Check that the push operation does not overflow the queue,
|
||||
// if it does, we must insert a barrier
|
||||
let mut tex_bar = None;
|
||||
if self.is_queue_full() {
|
||||
// The queue is full, there are 64 in-flight tex-ops.
|
||||
// make space by making removing 1 texture.
|
||||
tex_bar = Some(OpTexDepBar::MAX_TEXTURES_LEFT);
|
||||
self.flush_after(OpTexDepBar::MAX_TEXTURES_LEFT);
|
||||
// Now the queue is not full anymore
|
||||
debug_assert!(!self.is_queue_full());
|
||||
}
|
||||
|
||||
self.max_queue_len += 1;
|
||||
// Every entry is pushed by 1
|
||||
for x in self.queue_pos.values_mut() {
|
||||
x.push();
|
||||
}
|
||||
|
||||
// Put us on the queue as first
|
||||
let (idx, range) = Self::reg_ref_to_coords(reg);
|
||||
self.queue_pos
|
||||
.entry(idx)
|
||||
.or_insert(TexQueueSimulationBucket::EMPTY)
|
||||
.set_as_first(range);
|
||||
|
||||
tex_bar
|
||||
}
|
||||
|
||||
pub fn flush(&mut self, reg: RegRef) -> Option<u8> {
|
||||
let queue_pos = self.min_queue_position(reg);
|
||||
|
||||
let Some(queue_pos) = queue_pos else {
|
||||
return None; // Not in queue
|
||||
};
|
||||
|
||||
// Cut the queue
|
||||
self.flush_after(queue_pos);
|
||||
debug_assert!(self.min_queue_position(reg).is_none());
|
||||
|
||||
Some(queue_pos)
|
||||
}
|
||||
|
||||
pub fn merge(&mut self, other: &Self) {
|
||||
self.max_queue_len = self.max_queue_len.max(other.max_queue_len);
|
||||
for (key, y) in other.queue_pos.iter() {
|
||||
let x = self
|
||||
.queue_pos
|
||||
.entry(*key)
|
||||
.or_insert(TexQueueSimulationBucket::EMPTY);
|
||||
x.merge(y);
|
||||
}
|
||||
}
|
||||
|
||||
/// Simulates the execution of an instruction and returns the
|
||||
/// barrier level needed.
|
||||
pub fn visit_instr(&mut self, instr: &Instr) -> Option<u8> {
|
||||
// Flush register reads and writes
|
||||
// (avoid write-after-write and read-after-write hazards)
|
||||
// Compute the minimum required flush level (for barriers)
|
||||
let flush_level = if !self.queue_pos.is_empty() {
|
||||
let src_refs =
|
||||
instr.srcs().iter().filter_map(|x| x.src_ref.as_reg());
|
||||
let dst_refs = instr.dsts().iter().filter_map(|x| x.as_reg());
|
||||
|
||||
src_refs
|
||||
.chain(dst_refs)
|
||||
.filter_map(|reg_ref| self.flush(*reg_ref))
|
||||
.reduce(|a, b| a.min(b))
|
||||
} else {
|
||||
// The queue is empty, no need to check the instruction
|
||||
None
|
||||
};
|
||||
|
||||
// Push registers (if we are a tex instruction)
|
||||
// We might need to insert a barrier if the queue is full
|
||||
let push_level = if instr_needs_texbar(&instr) {
|
||||
let dst = instr.dsts()[0].as_reg().unwrap();
|
||||
self.push(*dst)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// If the flush needs a barrier, the queue will not be full,
|
||||
// therefore the push will not need a barrier.
|
||||
debug_assert!(!flush_level.is_some() || !push_level.is_some());
|
||||
flush_level.or(push_level)
|
||||
}
|
||||
}
|
||||
|
||||
fn instr_needs_texbar(instr: &Instr) -> bool {
|
||||
matches!(
|
||||
instr.op,
|
||||
Op::Tex(_)
|
||||
| Op::Tld(_)
|
||||
| Op::Tmml(_)
|
||||
| Op::Tld4(_)
|
||||
| Op::Txd(_)
|
||||
| Op::Txq(_)
|
||||
)
|
||||
}
|
||||
|
||||
/// Hardware has a FIFO queue of texture that are still fetching,
|
||||
/// when the oldest tex finishes executing, it's written to the reg,
|
||||
/// removed from the queue and it begins executing the new one.
|
||||
/// The problem arises when a texture is read while it is still being fetched
|
||||
/// to avoid it, we have a `texdepbar {i}` instruction that stalls until
|
||||
/// the texture fetch queue has at most {i} elements.
|
||||
/// e.g. the most simple solution is to have texdepbar 0 after each texture
|
||||
/// instruction, but this would stall the pipeline until the texture fetch
|
||||
/// finishes executing.
|
||||
/// This algorithm inserts `texdepbar` at each use of the texture results,
|
||||
/// simulating the texture queue execution.
|
||||
///
|
||||
/// Note that the texture queue has for each entry (texture data, register output)
|
||||
/// and each register can be on the queue only once (we don't want to have multiple texture
|
||||
/// operations in-flight that write to the same registers).
|
||||
/// This can lead to a neat algorithm:
|
||||
/// instead of tracking the queue directly, which can exponentially explode in complexity,
|
||||
/// track the position of each register, which needs at most 255/63 positions.
|
||||
/// For branches the state is duplicated in each basic block,
|
||||
/// for joins instead we want to keep both the minimum position of each
|
||||
/// entry and the maximum length og the queue to avoid overflows.
|
||||
///
|
||||
/// TODO: IF this pass is too slow, there are still optimizations left:
|
||||
/// - Our data-flow computes barrier levels and discards them,
|
||||
/// but since most CFG blocks do not need recomputation, we could save
|
||||
/// the barrier levels in a vec and save a pass later.
|
||||
/// - Instead of pushing by 1 each element in the queue on a `push` op,
|
||||
/// we could keep track of an in-flight range and use a wrapping timestamp
|
||||
/// this improves performance but needs careful implementation to avoid bugs
|
||||
fn insert_texture_barriers(f: &mut Function, sm: &dyn ShaderModel) {
|
||||
assert!(sm.is_kepler()); // Only kepler has texture barriers!
|
||||
|
||||
let mut state_in: Vec<_> = (0..f.blocks.len())
|
||||
.map(|_| TexQueueSimulationState::new())
|
||||
.collect();
|
||||
let mut state_out: Vec<_> = (0..f.blocks.len())
|
||||
.map(|_| TexQueueSimulationState::new())
|
||||
.collect();
|
||||
ForwardDataflow {
|
||||
cfg: &f.blocks,
|
||||
block_in: &mut state_in[..],
|
||||
block_out: &mut state_out[..],
|
||||
transfer: |_block_idx, block, sim_out, sim_in| {
|
||||
let mut sim = sim_in.clone();
|
||||
|
||||
for instr in block.instrs.iter() {
|
||||
// Ignore the barrier, we will recompute this later
|
||||
let _bar = sim.visit_instr(&instr);
|
||||
}
|
||||
|
||||
if *sim_out == sim {
|
||||
false
|
||||
} else {
|
||||
*sim_out = sim;
|
||||
true
|
||||
}
|
||||
},
|
||||
join: |sim_out, pred_sim_in| {
|
||||
sim_out.merge(pred_sim_in);
|
||||
},
|
||||
}
|
||||
.solve();
|
||||
|
||||
for (block, mut sim) in f.blocks.iter_mut().zip(state_in.into_iter()) {
|
||||
block.map_instrs(|instr| {
|
||||
if let Some(textures_left) = sim.visit_instr(&instr) {
|
||||
let bar = Instr::new_boxed(OpTexDepBar { textures_left });
|
||||
MappedInstrs::Many(vec![bar, instr])
|
||||
} else {
|
||||
MappedInstrs::One(instr)
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
|
||||
let mut uses = Box::new(RegTracker::new_with(&|| RegUse::None));
|
||||
let mut deps = DepGraph::new();
|
||||
|
|
@ -502,6 +840,12 @@ impl Shader<'_> {
|
|||
}
|
||||
|
||||
pub fn calc_instr_deps(&mut self) {
|
||||
if self.sm.is_kepler() {
|
||||
for f in &mut self.functions {
|
||||
insert_texture_barriers(f, self.sm);
|
||||
}
|
||||
}
|
||||
|
||||
if DEBUG.serial() {
|
||||
self.assign_deps_serial();
|
||||
} else {
|
||||
|
|
@ -524,3 +868,82 @@ impl Shader<'_> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a GPR reference covering the register indices in `range`.
    fn reg_gpr(range: Range<usize>) -> RegRef {
        RegRef::new(
            RegFile::GPR,
            range.start as u32,
            (range.end - range.start) as u8,
        )
    }

    #[test]
    fn test_texdepbar_basic() {
        let mut sim = TexQueueSimulationState::new();

        // RaW
        assert_eq!(sim.push(reg_gpr(0..4)), None);
        assert_eq!(sim.flush(reg_gpr(2..3)), Some(0));

        // 2 entries in the queue
        assert_eq!(sim.push(reg_gpr(0..2)), None); // [A]
        assert_eq!(sim.push(reg_gpr(2..4)), None); // [B, A]
        assert_eq!(sim.flush(reg_gpr(0..1)), Some(1)); // [B]
        assert_eq!(sim.flush(reg_gpr(3..4)), Some(0)); // []

        // Test bucket conflicts
        assert_eq!(sim.push(reg_gpr(0..1)), None);
        assert_eq!(sim.flush(reg_gpr(1..3)), None);
        assert_eq!(sim.flush(reg_gpr(0..3)), Some(0));

        // Bucket conflict part 2: Electric Boogaloo
        assert_eq!(sim.push(reg_gpr(1..2)), None);
        assert_eq!(sim.push(reg_gpr(0..1)), None);
        assert_eq!(sim.flush(reg_gpr(1..2)), Some(1));
        assert_eq!(sim.flush(reg_gpr(0..1)), Some(0));

        // Interesting CFG case that the old pass got wrong.
        // CFG: A -> [B, C] -> D
        // A pushes
        assert_eq!(sim.push(reg_gpr(0..4)), None);
        // B: pushes a tex then flushes it
        let mut b_sim = sim.clone();
        assert_eq!(b_sim.push(reg_gpr(4..8)), None);
        assert_eq!(b_sim.flush(reg_gpr(4..8)), Some(0));
        // C: pushes 3 tex and never flushes them
        let mut c_sim = sim.clone();
        assert_eq!(c_sim.push(reg_gpr(4..5)), None);
        assert_eq!(c_sim.push(reg_gpr(5..6)), None);
        assert_eq!(c_sim.push(reg_gpr(6..7)), None);
        // D: flushes the tex pushed by A.
        // The assertion must run on the merged state (`d_sim`); checking
        // `c_sim` here would leave the merge itself untested.
        let mut d_sim = b_sim;
        d_sim.merge(&c_sim);
        assert_eq!(d_sim.flush(reg_gpr(0..4)), Some(3));
        // the "shortest push path" would pass by B but in fact
        // by passing in B our texture is flushed off the queue.
        // (old algorithm would insert a texdepbar 1)
    }

    #[test]
    fn test_texdepbar_overflow() {
        let mut sim = TexQueueSimulationState::new();

        // Fill the texture queue
        for i in 0..(usize::from(OpTexDepBar::MAX_TEXTURES_LEFT) + 1) {
            assert_eq!(sim.push(reg_gpr(i..(i + 1))), None);
        }
        // The new push would overflow the queue, we NEED a barrier
        assert_eq!(
            sim.push(reg_gpr(64..65)),
            Some(OpTexDepBar::MAX_TEXTURES_LEFT)
        );
        assert_eq!(
            sim.push(reg_gpr(65..66)),
            Some(OpTexDepBar::MAX_TEXTURES_LEFT)
        );
    }
}
|
||||
|
|
|
|||
|
|
@ -1994,12 +1994,6 @@ impl<'a> ShaderFromNir<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
if self.sm.sm() < 50 {
|
||||
// TODO: texbar should be created by calc_instr_deps() and
|
||||
// should be less conservative than textures_left=0.
|
||||
// See the old pass: NVC0LegalizePostRA::insertTextureBarriers
|
||||
b.push_op(OpTexDepBar { textures_left: 0 });
|
||||
}
|
||||
self.set_ssa(tex.def.as_def(), nir_dst);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6945,7 +6945,15 @@ impl_display_for_op!(OpBar);
|
|||
#[repr(C)]
|
||||
#[derive(SrcsAsSlice, DstsAsSlice)]
|
||||
pub struct OpTexDepBar {
|
||||
pub textures_left: i8,
|
||||
pub textures_left: u8,
|
||||
}
|
||||
|
||||
impl OpTexDepBar {
|
||||
/// Maximum value of `textures_left` (62, i.e. 0x3e)
|
||||
///
|
||||
/// The maximum encodable value is 63. However, nvcc starts emitting
|
||||
/// TEXDEPBAR 0x3e as soon as it hits 62 texture instructions, so 62 is used here too.
|
||||
pub const MAX_TEXTURES_LEFT: u8 = 62;
|
||||
}
|
||||
|
||||
impl DisplayOp for OpTexDepBar {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue