nak: Re-work Instr::get_latency()

We re-order and re-arrange the whole thing by instruction type.  Also,
instead of returning an Option<u32>, have a has_fixed_latency() method
to check the instruction and then get_dst_latency() to get the latency
from instruction launch to the given destination index being available.
This lets us handle predicates properly which have a different number of
cycles for some reason.  Oh, it's now just as correct as the estimates
in nv50_ir_target_gm107.cpp.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24998>
This commit is contained in:
Faith Ekstrand 2023-09-11 14:18:06 -05:00 committed by Marge Bot
parent 3ad3f1376c
commit a07aa29f61
2 changed files with 78 additions and 40 deletions

View file

@ -231,7 +231,7 @@ fn assign_barriers(f: &mut Function) {
wait_mask &= !(1 << bar);
}
if instr.get_latency().is_some() {
if instr.has_fixed_latency() {
continue;
}
@ -301,10 +301,15 @@ fn calc_delays(f: &mut Function) {
let mut ready = RegTracker::new(0_u32);
for instr in b.instrs.iter_mut().rev() {
let mut min_start = cycle + 1; /* TODO: co-issue */
if let Some(latency) = instr.get_latency() {
ready.for_each_instr_dst_mut(instr, |c| {
min_start = max(min_start, *c + latency);
});
if instr.has_fixed_latency() {
for (idx, dst) in instr.dsts().iter().enumerate() {
if let Dst::Reg(reg) = dst {
let latency = instr.get_dst_latency(idx);
for c in &ready[*reg] {
min_start = max(min_start, *c + latency);
}
}
}
}
let delay = min_start - cycle;

View file

@ -3977,53 +3977,73 @@ impl Instr {
}
}
pub fn get_latency(&self) -> Option<u32> {
pub fn has_fixed_latency(&self) -> bool {
match self.op {
// Float ALU
Op::FAdd(_)
| Op::FFma(_)
| Op::FMnMx(_)
| Op::FMul(_)
| Op::FSet(_)
| Op::FSetP(_)
| Op::MuFu(_)
| Op::DAdd(_)
| Op::IAbs(_)
| Op::FSetP(_) => true,
// Multi-function unit is variable latency
Op::MuFu(_) => false,
// Double-precision float ALU
Op::DAdd(_) => false,
// Integer ALU
Op::Brev(_) | Op::Flo(_) | Op::PopC(_) => false,
Op::IAbs(_)
| Op::INeg(_)
| Op::IAdd3(_)
| Op::IAdd3X(_)
| Op::IMad(_)
| Op::IMad64(_)
| Op::IMnMx(_)
| Op::Lop3(_)
| Op::PLop3(_)
| Op::ISetP(_)
| Op::Shf(_) => Some(6),
Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::Mov(_) | Op::FRnd(_) => {
Some(15)
}
Op::Sel(_) => Some(15),
Op::CS2R(_) => None,
Op::S2R(_) => None,
Op::ALd(_) => None,
Op::ASt(_) => Some(15),
Op::Ipa(_) => None,
Op::Tex(_) => None,
Op::Tld(_) => None,
Op::Tld4(_) => None,
Op::Tmml(_) => None,
Op::Txd(_) => None,
Op::Txq(_) => None,
Op::SuLd(_) => None,
Op::SuSt(_) => None,
Op::SuAtom(_) => None,
Op::Ld(_) => None,
Op::Ldc(_) => None,
Op::St(_) => None,
Op::Atom(_) => None,
Op::AtomCas(_) => None,
Op::MemBar(_) => None,
Op::Bar(_) => None,
Op::Bra(_) | Op::Exit(_) => Some(15),
| Op::Lop3(_)
| Op::Shf(_) => true,
// Conversions are variable latency?!?
Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::FRnd(_) => false,
// Move ops
Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => true,
// Predicate ops
Op::PLop3(_) => true,
// Texture ops
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => false,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => false,
// Memory ops
Op::Ld(_)
| Op::Ldc(_)
| Op::St(_)
| Op::Atom(_)
| Op::AtomCas(_)
| Op::ALd(_)
| Op::ASt(_)
| Op::Ipa(_)
| Op::MemBar(_) => false,
// Control-flow ops
Op::Bra(_) | Op::Exit(_) => true,
// Miscellaneous ops
Op::Bar(_) | Op::CS2R(_) | Op::S2R(_) => false,
// Virtual ops
Op::Undef(_)
| Op::PhiSrcs(_)
| Op::PhiDsts(_)
@ -4033,7 +4053,20 @@ impl Instr {
| Op::FSOut(_) => {
panic!("Not a hardware opcode")
}
Op::PopC(_) | Op::Brev(_) | Op::Flo(_) | Op::Prmt(_) => Some(15),
}
}
/// Returns the latency, in cycles, from instruction launch until the
/// destination at `dst_idx` is available.
///
/// Must only be called on instructions for which `has_fixed_latency()`
/// is true; this is checked in debug builds.  An unused destination
/// (`Dst::None`) yields 0.  Destinations in a predicate file take 13
/// cycles and all other destinations take 6.
///
/// # Panics
///
/// Panics if `dst_idx` is out of range for `self.dsts()`.
pub fn get_dst_latency(&self, dst_idx: usize) -> u32 {
    debug_assert!(self.has_fixed_latency());

    // Work out whether the destination lives in a predicate file; a
    // missing destination has nothing to wait for.
    let writes_predicate = match self.dsts()[dst_idx] {
        Dst::None => return 0,
        Dst::SSA(ssa) => ssa.file().is_predicate(),
        Dst::Reg(reg) => reg.file().is_predicate(),
    };

    // Predicate results become available later than regular results.
    match writes_predicate {
        true => 13,
        false => 6,
    }
}
}