nak: Add latency helpers to ShaderModel and use them

For now, these all just call into sched_common.rs, but this gives us the
interface we really want going forward.

Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34302>
This commit is contained in:
Faith Ekstrand 2025-03-31 12:31:23 -05:00 committed by Marge Bot
parent 3112fbcc56
commit 8e2e1e43fa
3 changed files with 70 additions and 52 deletions

View file

@ -3,7 +3,7 @@
use crate::api::{GetDebugFlags, DEBUG};
use crate::ir::*;
use crate::sched_common::*;
use crate::sched_common::RegTracker;
use std::cmp::max;
use std::collections::{HashMap, HashSet};
@ -257,7 +257,7 @@ fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
waits.extend_from_slice(u.deps());
});
if instr.needs_scoreboard(sm.sm()) {
if sm.op_needs_scoreboard(&instr.op) {
let (rd, wr) = deps.add_instr(bi, ip);
uses.for_each_instr_src_mut(instr, |_, u| {
// Only mark a dep as signaled if we actually have
@ -314,7 +314,7 @@ fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
instr.deps.set_yield(true);
}
if !instr.needs_scoreboard(sm.sm()) {
if !sm.op_needs_scoreboard(&instr.op) {
continue;
}
@ -360,7 +360,7 @@ fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u32 {
for ip in (0..b.instrs.len()).rev() {
let instr = &b.instrs[ip];
let mut min_start = cycle + exec_latency(sm.sm(), &instr.op);
let mut min_start = cycle + sm.exec_latency(&instr.op);
if let Some(bar) = instr.deps.rd_bar() {
min_start = max(min_start, bars[usize::from(bar)] + 2);
}
@ -372,13 +372,12 @@ fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u32 {
// We don't know how it will be used but it may be used in
// the next block so we need at least assume the maximum
// destination latency from the end of the block.
let s = instr_latency(sm.sm(), &instr.op, i);
let s = sm.worst_latency(&instr.op, i);
min_start = max(min_start, s);
}
RegUse::Write((w_ip, w_dst_idx)) => {
let s = instr_cycle[*w_ip]
+ waw_latency(
sm.sm(),
+ sm.waw_latency(
&instr.op,
i,
&b.instrs[*w_ip].op,
@ -390,10 +389,9 @@ fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u32 {
for (r_ip, r_src_idx) in reads {
let c = instr_cycle[*r_ip];
let s = if *r_src_idx == usize::MAX {
c + paw_latency(sm.sm(), &instr.op, i)
c + sm.paw_latency(&instr.op, i)
} else {
c + raw_latency(
sm.sm(),
c + sm.raw_latency(
&instr.op,
i,
&b.instrs[*r_ip].op,
@ -408,8 +406,7 @@ fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u32 {
RegUse::None => (),
RegUse::Write((w_ip, w_dst_idx)) => {
let s = instr_cycle[*w_ip]
+ war_latency(
sm.sm(),
+ sm.war_latency(
&instr.op,
i,
&b.instrs[*w_ip].op,
@ -458,7 +455,7 @@ fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) -> u32 {
if matches!(instr.op, Op::SrcBar(_)) {
instr.op = Op::Nop(OpNop { label: None });
MappedInstrs::One(instr)
} else if exec_latency(sm.sm(), &instr.op) > 1 {
} else if sm.exec_latency(&instr.op) > 1 {
let mut nop = Instr::new_boxed(OpNop { label: None });
nop.deps.set_delay(2);
MappedInstrs::Many(vec![instr, nop])

View file

@ -9,6 +9,7 @@ use nak_bindings::*;
pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder};
use crate::legalize::LegalizeBuilder;
use crate::sched_common;
use crate::sph::{OutputTopology, PixelImap};
use compiler::as_slice::*;
use compiler::cfg::CFG;
@ -7002,14 +7003,6 @@ impl Instr {
}
}
/// Reports whether this instruction's op has a fixed latency on shader
/// model `sm`. This is a pure forwarder to `has_fixed_latency` on `self.op`.
pub fn has_fixed_latency(&self, sm: u8) -> bool {
self.op.has_fixed_latency(sm)
}
/// Returns `true` exactly when `has_fixed_latency(sm)` is `false` for this
/// instruction.
pub fn needs_scoreboard(&self, sm: u8) -> bool {
!self.has_fixed_latency(sm)
}
/// Returns `true` when this instruction's op is `Op::Bar` or `Op::BSync`,
/// and `false` for every other op.
pub fn needs_yield(&self) -> bool {
matches!(&self.op, Op::Bar(_) | Op::BSync(_))
}
@ -7507,6 +7500,53 @@ pub trait ShaderModel {
fn op_can_be_uniform(&self, op: &Op) -> bool;
// Scheduling information
/// Returns `true` when `op` does not have a fixed latency on this shader
/// model, i.e. when `op.has_fixed_latency(self.sm())` is `false`.
///
/// This is a provided (default) method, so an implementor may override it.
fn op_needs_scoreboard(&self, op: &Op) -> bool {
!op.has_fixed_latency(self.sm())
}
/// Execution latency of `op` on this shader model.
///
/// The default implementation forwards to `sched_common::exec_latency`,
/// passing `self.sm()` as the shader-model number.
// NOTE(review): presumably measured in cycles (callers add the result to a
// cycle counter) -- confirm against sched_common.
fn exec_latency(&self, op: &Op) -> u32 {
sched_common::exec_latency(self.sm(), op)
}
/// Latency between destination `dst_idx` of the `write` op and source
/// `src_idx` of the `read` op.
///
/// The default implementation forwards all four arguments, in the same
/// order, to `sched_common::raw_latency` together with `self.sm()`.
// NOTE(review): "raw" presumably means read-after-write -- confirm against
// sched_common.
fn raw_latency(
&self,
write: &Op,
dst_idx: usize,
read: &Op,
src_idx: usize,
) -> u32 {
sched_common::raw_latency(self.sm(), write, dst_idx, read, src_idx)
}
/// Latency between source `src_idx` of the `read` op and destination
/// `dst_idx` of the `write` op.
///
/// Note the argument order: the reading op and its source index come first,
/// the writing op and its destination index second. The default
/// implementation forwards them unchanged to `sched_common::war_latency`
/// together with `self.sm()`.
// NOTE(review): "war" presumably means write-after-read -- confirm against
// sched_common.
fn war_latency(
&self,
read: &Op,
src_idx: usize,
write: &Op,
dst_idx: usize,
) -> u32 {
sched_common::war_latency(self.sm(), read, src_idx, write, dst_idx)
}
/// Latency between destination `a_dst_idx` of op `a` and destination
/// `b_dst_idx` of op `b`.
///
/// The default implementation forwards the arguments unchanged to
/// `sched_common::waw_latency` together with `self.sm()`.
// NOTE(review): "waw" presumably means write-after-write; which of `a`/`b`
// is the earlier write is not visible here -- confirm against sched_common.
fn waw_latency(
&self,
a: &Op,
a_dst_idx: usize,
b: &Op,
b_dst_idx: usize,
) -> u32 {
sched_common::waw_latency(self.sm(), a, a_dst_idx, b, b_dst_idx)
}
/// Latency associated with destination `dst_idx` of the `write` op, as
/// computed by `sched_common::paw_latency` for this model's `sm()`.
///
/// Unlike `raw_latency`, no reading op or source index is supplied; the
/// callers in this change use it when the recorded read's source index is
/// `usize::MAX`.
// NOTE(review): the expansion of "paw" is not visible here (possibly
// predicate-after-write) -- confirm against sched_common.
fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32 {
sched_common::paw_latency(self.sm(), write, dst_idx)
}
/// Latency for destination `dst_idx` of the `write` op when the consumer is
/// not known.
///
/// The default implementation forwards to `sched_common::instr_latency`
/// (note the different name) together with `self.sm()`. Callers in this
/// change use it where a written value may be consumed in a later block and
/// a maximum destination latency has to be assumed.
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
sched_common::instr_latency(self.sm(), write, dst_idx)
}
fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op);
fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32>;
}

View file

@ -3,10 +3,7 @@
use crate::ir::*;
use crate::opt_instr_sched_common::*;
use crate::sched_common::{
exec_latency, instr_latency, paw_latency, raw_latency, war_latency,
waw_latency, RegTracker,
};
use crate::sched_common::RegTracker;
use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BinaryHeap;
@ -74,29 +71,18 @@ fn generate_dep_graph(
uses.for_each_instr_dst_mut(instr, |i, u| {
if let Some((w_ip, w_dst_idx)) = u.write {
let latency = waw_latency(
sm.sm(),
&instr.op,
i,
&instrs[w_ip].op,
w_dst_idx,
);
let latency =
sm.waw_latency(&instr.op, i, &instrs[w_ip].op, w_dst_idx);
g.add_edge(ip, w_ip, EdgeLabel { latency });
}
for &(r_ip, r_src_idx) in &u.reads {
let mut latency = if r_src_idx == usize::MAX {
paw_latency(sm.sm(), &instr.op, i)
sm.paw_latency(&instr.op, i)
} else {
raw_latency(
sm.sm(),
&instr.op,
i,
&instrs[r_ip].op,
r_src_idx,
)
sm.raw_latency(&instr.op, i, &instrs[r_ip].op, r_src_idx)
};
if instr.needs_scoreboard(sm.sm()) {
if sm.op_needs_scoreboard(&instr.op) {
latency = max(
latency,
estimate_variable_latency(sm.sm(), &instr.op),
@ -107,13 +93,8 @@ fn generate_dep_graph(
});
uses.for_each_instr_src_mut(instr, |i, u| {
if let Some((w_ip, w_dst_idx)) = u.write {
let latency = war_latency(
sm.sm(),
&instr.op,
i,
&instrs[w_ip].op,
w_dst_idx,
);
let latency =
sm.war_latency(&instr.op, i, &instrs[w_ip].op, w_dst_idx);
g.add_edge(ip, w_ip, EdgeLabel { latency });
}
});
@ -131,16 +112,16 @@ fn generate_dep_graph(
// Initialize this node's distance to the end
let mut ready_cycle = (0..instr.dsts().len())
.map(|i| instr_latency(sm.sm(), &instr.op, i))
.map(|i| sm.worst_latency(&instr.op, i))
.max()
.unwrap_or(0);
if instr.needs_scoreboard(sm.sm()) {
if sm.op_needs_scoreboard(&instr.op) {
let var_latency = estimate_variable_latency(sm.sm(), &instr.op)
+ exec_latency(sm.sm(), &instrs[instrs.len() - 1].op);
+ sm.exec_latency(&instrs[instrs.len() - 1].op);
ready_cycle = max(ready_cycle, var_latency);
}
let label = &mut g.nodes[ip].label;
label.exec_latency = exec_latency(sm.sm(), &instr.op);
label.exec_latency = sm.exec_latency(&instr.op);
label.ready_cycle = ready_cycle;
}