nak/kepler: Refine instruction scheduling

Previously some KeplerA chips failed various dEQP tests when instruction
scheduling was enabled.
In particular, `memory_model.message_passing` had issues where a
`membar` instruction canceled some in-flight predicate writes, and
`barrier.write_image_tess_control_read_image_compute.image_128_r32_uint`
had issues around the `Cont` instruction.

This patch refines instruction scheduling to better match the output of
nvcc, fixing the various failing dEQP tests.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13528
Fixes: c35990c4bc ("nak: Add real instruction dependencies for Kepler")
Signed-off-by: Lorenzo Rossi <snowycoder@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36393>
This commit is contained in:
Lorenzo Rossi 2025-08-13 18:24:41 +02:00 committed by Marge Bot
parent 42088cd602
commit 7375dfd56d
3 changed files with 28 additions and 1 deletions

View file

@ -766,6 +766,15 @@ fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u32 {
uses.for_each_instr_src_mut(instr, |i, c| {
c.add_read((ip, i));
});
// Kepler A membar conflicts with predicate writes
if sm.is_kepler_a() && matches!(&instr.op, Op::MemBar(_)) {
uses.for_each_pred(|c| {
c.add_read((ip, usize::MAX));
});
uses.for_each_carry(|c| {
c.add_read((ip, usize::MAX));
});
}
for (bar, c) in bars.iter_mut().enumerate() {
if instr.deps.wt_bar_mask & (1 << bar) != 0 {
*c = min_start;

View file

@ -83,6 +83,18 @@ impl<T> RegTracker<T> {
}
}
}
/// Invokes `f` once, in order, on a mutable reference to every entry
/// stored in `self.pred`.
pub fn for_each_pred(&mut self, mut f: impl FnMut(&mut T)) {
    self.pred[..].iter_mut().for_each(&mut f);
}
/// Invokes `f` once, in order, on a mutable reference to every entry
/// stored in `self.carry`.
pub fn for_each_carry(&mut self, mut f: impl FnMut(&mut T)) {
    // NOTE(review): assumes `carry` is an array/slice-like field, so that
    // `iter_mut()` walks the same elements as `for c in &mut self.carry`.
    self.carry.iter_mut().for_each(&mut f);
}
}
impl<T> Index<RegRef> for RegTracker<T> {

View file

@ -13,6 +13,9 @@ pub fn instr_latency(_sm: u8, op: &Op, _dst_idx: usize) -> u32 {
Op::Ld(_) => 24,
Op::ALd(_) => 24,
Op::IMul(_) => 15, // This does not apply to imad, right? right???
Op::ISetP(_) => 13,
Op::PSetP(_) => 13,
Op::IAdd2(o) if !o.carry_out.is_none() => 13,
Op::Tex(_)
| Op::Tld(_)
| Op::Tld4(_)
@ -23,7 +26,8 @@ pub fn instr_latency(_sm: u8, op: &Op, _dst_idx: usize) -> u32 {
}
}
pub fn instr_exec_latency(_sm: u8, op: &Op) -> u32 {
pub fn instr_exec_latency(sm: u8, op: &Op) -> u32 {
let is_kepler_a = sm == 30;
match op {
Op::Tex(_)
| Op::Tld(_)
@ -31,6 +35,8 @@ pub fn instr_exec_latency(_sm: u8, op: &Op) -> u32 {
| Op::Tmml(_)
| Op::Txd(_)
| Op::Txq(_) => 17,
Op::MemBar(_) => 16,
Op::Cont(_) | Op::Brk(_) if is_kepler_a => 5,
Op::Exit(_) => 15,
_ => 1,
}