mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-25 19:30:11 +01:00
nak: Move latency information into the per-SM files
This is probably a little more code but we're about to add real data for Turing+ so it's better to have things contained like this. Since Volta and earlier will always remain hacks, we might as well have those hacks in the per-SM files rather than pretending we have a general thing in sched_common.rs. Reviewed-by: Dave Airlie <airlied@redhat.com> Reviewed-by: Mel Henning <mhenning@darkrefraction.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34302>
This commit is contained in:
parent
64ff3e8cb8
commit
24a8795946
4 changed files with 191 additions and 110 deletions
|
|
@ -9,7 +9,6 @@ use nak_bindings::*;
|
|||
|
||||
pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder};
|
||||
use crate::legalize::LegalizeBuilder;
|
||||
use crate::sched_common;
|
||||
use crate::sph::{OutputTopology, PixelImap};
|
||||
use compiler::as_slice::*;
|
||||
use compiler::cfg::CFG;
|
||||
|
|
@ -7505,45 +7504,41 @@ pub trait ShaderModel {
|
|||
!op.has_fixed_latency(self.sm())
|
||||
}
|
||||
|
||||
/// Latency before another non-NOP can execute
|
||||
fn exec_latency(&self, op: &Op) -> u32;
|
||||
|
||||
/// Read-after-write latency
|
||||
fn raw_latency(
|
||||
&self,
|
||||
write: &Op,
|
||||
dst_idx: usize,
|
||||
read: &Op,
|
||||
src_idx: usize,
|
||||
) -> u32 {
|
||||
sched_common::raw_latency(self.sm(), write, dst_idx, read, src_idx)
|
||||
}
|
||||
) -> u32;
|
||||
|
||||
/// Write-after-read latency
|
||||
fn war_latency(
|
||||
&self,
|
||||
read: &Op,
|
||||
src_idx: usize,
|
||||
write: &Op,
|
||||
dst_idx: usize,
|
||||
) -> u32 {
|
||||
sched_common::war_latency(self.sm(), read, src_idx, write, dst_idx)
|
||||
}
|
||||
) -> u32;
|
||||
|
||||
/// Write-after-write latency
|
||||
fn waw_latency(
|
||||
&self,
|
||||
a: &Op,
|
||||
a_dst_idx: usize,
|
||||
b: &Op,
|
||||
b_dst_idx: usize,
|
||||
) -> u32 {
|
||||
sched_common::waw_latency(self.sm(), a, a_dst_idx, b, b_dst_idx)
|
||||
}
|
||||
) -> u32;
|
||||
|
||||
fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32 {
|
||||
sched_common::paw_latency(self.sm(), write, dst_idx)
|
||||
}
|
||||
/// Predicate read-after-write latency
|
||||
fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32;
|
||||
|
||||
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
|
||||
sched_common::instr_latency(self.sm(), write, dst_idx)
|
||||
}
|
||||
/// Worst-case access-after-write latency
|
||||
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32;
|
||||
|
||||
fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op);
|
||||
fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32>;
|
||||
|
|
|
|||
|
|
@ -5,100 +5,6 @@ use crate::ir::*;
|
|||
|
||||
use std::ops::{Index, IndexMut, Range};
|
||||
|
||||
pub fn instr_latency(sm: u8, op: &Op, dst_idx: usize) -> u32 {
|
||||
let file = match op.dsts_as_slice()[dst_idx] {
|
||||
Dst::None => return 0,
|
||||
Dst::SSA(vec) => vec.file().unwrap(),
|
||||
Dst::Reg(reg) => reg.file(),
|
||||
};
|
||||
|
||||
let (gpr_latency, pred_latency) = if sm < 80 {
|
||||
match op {
|
||||
// Double-precision float ALU
|
||||
Op::DAdd(_)
|
||||
| Op::DFma(_)
|
||||
| Op::DMnMx(_)
|
||||
| Op::DMul(_)
|
||||
| Op::DSetP(_)
|
||||
// Half-precision float ALU
|
||||
| Op::HAdd2(_)
|
||||
| Op::HFma2(_)
|
||||
| Op::HMul2(_)
|
||||
| Op::HSet2(_)
|
||||
| Op::HSetP2(_)
|
||||
| Op::HMnMx2(_) => if sm == 70 {
|
||||
// Volta is even slower
|
||||
(13, 15)
|
||||
} else {
|
||||
(13, 14)
|
||||
}
|
||||
_ => (6, 13)
|
||||
}
|
||||
} else {
|
||||
(6, 13)
|
||||
};
|
||||
|
||||
// This is BS and we know it
|
||||
match file {
|
||||
RegFile::GPR => gpr_latency,
|
||||
RegFile::UGPR => 12,
|
||||
RegFile::Pred => pred_latency,
|
||||
RegFile::UPred => 11,
|
||||
RegFile::Bar => 0, // Barriers have a HW scoreboard
|
||||
RegFile::Carry => 6,
|
||||
RegFile::Mem => panic!("Not a register"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Read-after-write latency
|
||||
pub fn raw_latency(
|
||||
sm: u8,
|
||||
write: &Op,
|
||||
dst_idx: usize,
|
||||
_read: &Op,
|
||||
_src_idx: usize,
|
||||
) -> u32 {
|
||||
instr_latency(sm, write, dst_idx)
|
||||
}
|
||||
|
||||
/// Write-after-read latency
|
||||
pub fn war_latency(
|
||||
_sm: u8,
|
||||
_read: &Op,
|
||||
_src_idx: usize,
|
||||
_write: &Op,
|
||||
_dst_idx: usize,
|
||||
) -> u32 {
|
||||
// We assume the source gets read in the first 4 cycles. We don't know how
|
||||
// quickly the write will happen. This is all a guess.
|
||||
4
|
||||
}
|
||||
|
||||
/// Write-after-write latency
|
||||
pub fn waw_latency(
|
||||
sm: u8,
|
||||
a: &Op,
|
||||
a_dst_idx: usize,
|
||||
_b: &Op,
|
||||
_b_dst_idx: usize,
|
||||
) -> u32 {
|
||||
// We know our latencies are wrong so assume the wrote could happen anywhere
|
||||
// between 0 and instr_latency(a) cycles
|
||||
instr_latency(sm, a, a_dst_idx)
|
||||
}
|
||||
|
||||
/// Predicate read-after-write latency
|
||||
pub fn paw_latency(sm: u8, write: &Op, _dst_idx: usize) -> u32 {
|
||||
if sm == 70 {
|
||||
match write {
|
||||
Op::DSetP(_) | Op::HSetP2(_) => 15,
|
||||
_ => 13,
|
||||
}
|
||||
} else {
|
||||
13
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RegTracker<T> {
|
||||
reg: [T; 255],
|
||||
ureg: [T; 63],
|
||||
|
|
|
|||
|
|
@ -10,6 +10,43 @@ use bitview::*;
|
|||
use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
|
||||
pub fn instr_latency(_sm: u8, op: &Op, dst_idx: usize) -> u32 {
|
||||
let file = match op.dsts_as_slice()[dst_idx] {
|
||||
Dst::None => return 0,
|
||||
Dst::SSA(vec) => vec.file().unwrap(),
|
||||
Dst::Reg(reg) => reg.file(),
|
||||
};
|
||||
|
||||
let (gpr_latency, pred_latency) = match op {
|
||||
// Double-precision float ALU
|
||||
Op::DAdd(_)
|
||||
| Op::DFma(_)
|
||||
| Op::DMnMx(_)
|
||||
| Op::DMul(_)
|
||||
| Op::DSetP(_)
|
||||
// Half-precision float ALU
|
||||
| Op::HAdd2(_)
|
||||
| Op::HFma2(_)
|
||||
| Op::HMul2(_)
|
||||
| Op::HSet2(_)
|
||||
| Op::HSetP2(_)
|
||||
| Op::HMnMx2(_) => {
|
||||
(13, 14)
|
||||
}
|
||||
_ => (6, 13)
|
||||
};
|
||||
|
||||
// This is BS and we know it
|
||||
match file {
|
||||
RegFile::GPR => gpr_latency,
|
||||
RegFile::Pred => pred_latency,
|
||||
RegFile::UGPR | RegFile::UPred => panic!("No uniform registers"),
|
||||
RegFile::Bar => 0, // Barriers have a HW scoreboard
|
||||
RegFile::Carry => 6,
|
||||
RegFile::Mem => panic!("Not a register"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shader model implementation backed by the SM50 code paths in this file.
pub struct ShaderModel50 {
    /// Numeric shader model version this instance was created for.
    sm: u8,
}
|
||||
|
|
@ -75,6 +112,55 @@ impl ShaderModel for ShaderModel50 {
|
|||
}
|
||||
}
|
||||
|
||||
fn raw_latency(
|
||||
&self,
|
||||
write: &Op,
|
||||
dst_idx: usize,
|
||||
_read: &Op,
|
||||
_src_idx: usize,
|
||||
) -> u32 {
|
||||
instr_latency(self.sm, write, dst_idx)
|
||||
}
|
||||
|
||||
fn war_latency(
|
||||
&self,
|
||||
_read: &Op,
|
||||
_src_idx: usize,
|
||||
_write: &Op,
|
||||
_dst_idx: usize,
|
||||
) -> u32 {
|
||||
// We assume the source gets read in the first 4 cycles. We don't know
|
||||
// how quickly the write will happen. This is all a guess.
|
||||
4
|
||||
}
|
||||
|
||||
fn waw_latency(
|
||||
&self,
|
||||
a: &Op,
|
||||
a_dst_idx: usize,
|
||||
_b: &Op,
|
||||
_b_dst_idx: usize,
|
||||
) -> u32 {
|
||||
// We know our latencies are wrong so assume the wrote could happen
|
||||
// anywhere between 0 and instr_latency(a) cycles
|
||||
instr_latency(self.sm, a, a_dst_idx)
|
||||
}
|
||||
|
||||
/// Predicate read-after-write latency: a fixed guess of 13 cycles.
///
/// Both the writing instruction and the destination index are ignored.
fn paw_latency(&self, _write: &Op, _dst_idx: usize) -> u32 {
    // The shared helper this was copied from had an `sm == 70` (Volta)
    // special case returning 15 for DSetP/HSetP2.  It is dropped here to
    // match this file's instr_latency(), which ignores its `_sm` argument
    // and has no Volta case either.
    // NOTE(review): assumes ShaderModel50 is only ever constructed for
    // sm < 70 -- confirm against the constructor.
    13
}
|
||||
|
||||
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
|
||||
instr_latency(self.sm, write, dst_idx)
|
||||
}
|
||||
|
||||
fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) {
|
||||
as_sm50_op_mut(op).legalize(b);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,6 +18,51 @@ impl ShaderModel70 {
|
|||
fn has_uniform_alu(&self) -> bool {
|
||||
self.sm >= 73
|
||||
}
|
||||
|
||||
fn instr_latency(&self, op: &Op, dst_idx: usize) -> u32 {
|
||||
let file = match op.dsts_as_slice()[dst_idx] {
|
||||
Dst::None => return 0,
|
||||
Dst::SSA(vec) => vec.file().unwrap(),
|
||||
Dst::Reg(reg) => reg.file(),
|
||||
};
|
||||
|
||||
let (gpr_latency, pred_latency) = if self.sm < 80 {
|
||||
match op {
|
||||
// Double-precision float ALU
|
||||
Op::DAdd(_)
|
||||
| Op::DFma(_)
|
||||
| Op::DMnMx(_)
|
||||
| Op::DMul(_)
|
||||
| Op::DSetP(_)
|
||||
// Half-precision float ALU
|
||||
| Op::HAdd2(_)
|
||||
| Op::HFma2(_)
|
||||
| Op::HMul2(_)
|
||||
| Op::HSet2(_)
|
||||
| Op::HSetP2(_)
|
||||
| Op::HMnMx2(_) => if self.sm == 70 {
|
||||
// Volta is even slower
|
||||
(13, 15)
|
||||
} else {
|
||||
(13, 14)
|
||||
}
|
||||
_ => (6, 13)
|
||||
}
|
||||
} else {
|
||||
(6, 13)
|
||||
};
|
||||
|
||||
// This is BS and we know it
|
||||
match file {
|
||||
RegFile::GPR => gpr_latency,
|
||||
RegFile::UGPR => 12,
|
||||
RegFile::Pred => pred_latency,
|
||||
RegFile::UPred => 11,
|
||||
RegFile::Bar => 0, // Barriers have a HW scoreboard
|
||||
RegFile::Carry => 6,
|
||||
RegFile::Mem => panic!("Not a register"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShaderModel for ShaderModel70 {
|
||||
|
|
@ -117,6 +162,55 @@ impl ShaderModel for ShaderModel70 {
|
|||
}
|
||||
}
|
||||
|
||||
fn raw_latency(
|
||||
&self,
|
||||
write: &Op,
|
||||
dst_idx: usize,
|
||||
_read: &Op,
|
||||
_src_idx: usize,
|
||||
) -> u32 {
|
||||
self.instr_latency(write, dst_idx)
|
||||
}
|
||||
|
||||
fn war_latency(
|
||||
&self,
|
||||
_read: &Op,
|
||||
_src_idx: usize,
|
||||
_write: &Op,
|
||||
_dst_idx: usize,
|
||||
) -> u32 {
|
||||
// We assume the source gets read in the first 4 cycles. We don't know
|
||||
// how quickly the write will happen. This is all a guess.
|
||||
4
|
||||
}
|
||||
|
||||
fn waw_latency(
|
||||
&self,
|
||||
a: &Op,
|
||||
a_dst_idx: usize,
|
||||
_b: &Op,
|
||||
_b_dst_idx: usize,
|
||||
) -> u32 {
|
||||
// We know our latencies are wrong so assume the wrote could happen
|
||||
// anywhere between 0 and instr_latency(a) cycles
|
||||
self.instr_latency(a, a_dst_idx)
|
||||
}
|
||||
|
||||
fn paw_latency(&self, write: &Op, _dst_idx: usize) -> u32 {
|
||||
if self.sm == 70 {
|
||||
match write {
|
||||
Op::DSetP(_) | Op::HSetP2(_) => 15,
|
||||
_ => 13,
|
||||
}
|
||||
} else {
|
||||
13
|
||||
}
|
||||
}
|
||||
|
||||
/// Worst-case access-after-write latency: `instr_latency()` of the written
/// destination.
fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 {
    let write_latency = self.instr_latency(write, dst_idx);
    write_latency
}
|
||||
|
||||
fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) {
|
||||
legalize_sm70_op(self, b, op);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue