nak: Add a prepass instruction scheduler

Totals:
CodeSize: 5750619392 -> 5817868528 (+1.17%); split: -0.32%, +1.49%
Number of GPRs: 16276896 -> 16342962 (+0.41%); split: -1.00%, +1.41%
SLM Size: 8927212 -> 8739732 (-2.10%); split: -2.59%, +0.49%
Static cycle count: 1497053946 -> 1412275595 (-5.66%); split: -6.00%, +0.33%
Spills to memory: 14248182 -> 14157708 (-0.63%); split: -1.25%, +0.62%
Fills from memory: 14248182 -> 14157708 (-0.63%); split: -1.25%, +0.62%
Spills to reg: 9143000 -> 9042885 (-1.09%); split: -1.22%, +0.13%
Fills from reg: 6892354 -> 6808724 (-1.21%); split: -1.33%, +0.12%
Max warps/SM: 6482016 -> 6567500 (+1.32%); split: +1.40%, -0.08%

Totals from 189431 (96.40% of 196502) affected shaders:
CodeSize: 5739697280 -> 5806946416 (+1.17%); split: -0.32%, +1.50%
Number of GPRs: 16114477 -> 16180543 (+0.41%); split: -1.01%, +1.42%
SLM Size: 8927180 -> 8739700 (-2.10%); split: -2.59%, +0.49%
Static cycle count: 1495006918 -> 1410228567 (-5.67%); split: -6.00%, +0.33%
Spills to memory: 14248182 -> 14157708 (-0.63%); split: -1.25%, +0.62%
Fills from memory: 14248182 -> 14157708 (-0.63%); split: -1.25%, +0.62%
Spills to reg: 9141040 -> 9040925 (-1.10%); split: -1.23%, +0.13%
Fills from reg: 6890401 -> 6806771 (-1.21%); split: -1.34%, +0.12%
Max warps/SM: 6149140 -> 6234624 (+1.39%); split: +1.47%, -0.08%

Reviewed-by: Mary Guillemard <mary@mary.zone>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33306>
This commit is contained in:
Mel Henning 2025-01-30 13:11:00 -05:00 committed by Marge Bot
parent 5caee114ec
commit b55b8da012
5 changed files with 1202 additions and 6 deletions

View file

@ -450,6 +450,7 @@ fn nak_compile_shader_internal(
pass!(s, opt_out);
pass!(s, legalize);
pass!(s, opt_dce);
pass!(s, opt_instr_sched_prepass);
pass!(s, assign_regs);
pass!(s, lower_par_copies);
pass!(s, lower_copy_swap);

View file

@ -327,7 +327,7 @@ impl Iterator for RegFileSet {
///
/// This is used by several passes which need to replicate a data structure
/// per-register-file.
#[derive(Clone, Copy)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PerRegFile<T> {
// One `T` per register file; the array length is fixed at NUM_REG_FILES.
per_file: [T; NUM_REG_FILES],
}
@ -8264,6 +8264,167 @@ impl Op {
_ => false,
}
}
/// Reports whether this op is classified as "virtual".
///
/// The ops in the "Virtual ops" arm always return `true`.  `R2UR` returns
/// `true` only when its source is uniform or its destination register file
/// is `UPred`.  Every other op returns `false`.
///
/// The match deliberately has no wildcard arm: every `Op` variant is listed
/// by name, so a newly added variant will not silently fall into either
/// category.
// NOTE(review): "virtual" presumably means the op has no direct hardware
// encoding and must be lowered before emission -- confirm against the
// lowering passes.
pub fn is_virtual(&self) -> bool {
    match self {
        // Virtual ops
        Op::Undef(_)
        | Op::SrcBar(_)
        | Op::PhiSrcs(_)
        | Op::PhiDsts(_)
        | Op::Copy(_)
        | Op::Pin(_)
        | Op::Unpin(_)
        | Op::Swap(_)
        | Op::ParCopy(_)
        | Op::RegOut(_)
        | Op::Annotate(_) => true,

        // Uniform ops: R2UR is the only op whose answer depends on its
        // operands rather than on the opcode alone.
        Op::R2UR(r2ur) => {
            r2ur.src.is_uniform() || r2ur.dst.file() == Some(RegFile::UPred)
        }
        Op::Redux(_) => false,

        // Float ALU
        Op::F2FP(_)
        | Op::FAdd(_)
        | Op::FFma(_)
        | Op::FMnMx(_)
        | Op::FMul(_)
        | Op::FSet(_)
        | Op::FSetP(_)
        | Op::HAdd2(_)
        | Op::HFma2(_)
        | Op::HMul2(_)
        | Op::HSet2(_)
        | Op::HSetP2(_)
        | Op::HMnMx2(_)
        | Op::FSwz(_)
        | Op::FSwzAdd(_) => false,

        // Multi-function unit
        Op::Rro(_) | Op::MuFu(_) => false,

        // Double-precision float ALU
        Op::DAdd(_)
        | Op::DFma(_)
        | Op::DMnMx(_)
        | Op::DMul(_)
        | Op::DSetP(_) => false,

        // Matrix Multiply Add
        Op::Imma(_) | Op::Hmma(_) | Op::Ldsm(_) | Op::Movm(_) => false,

        // Integer ALU
        Op::BRev(_)
        | Op::Flo(_)
        | Op::PopC(_)
        | Op::IMad(_)
        | Op::IMul(_)
        | Op::BMsk(_)
        | Op::IAbs(_)
        | Op::IAdd2(_)
        | Op::IAdd2X(_)
        | Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::IDp4(_)
        | Op::IMad64(_)
        | Op::IMnMx(_)
        | Op::ISetP(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::Lop2(_)
        | Op::Lop3(_)
        | Op::SuClamp(_)
        | Op::SuBfm(_)
        | Op::SuEau(_)
        | Op::IMadSp(_)
        | Op::Shf(_)
        | Op::Shl(_)
        | Op::Shr(_)
        | Op::Bfe(_) => false,

        // Conversions
        Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::I2I(_) | Op::FRnd(_) => {
            false
        }

        // Move ops
        Op::Mov(_)
        | Op::Prmt(_)
        | Op::Sel(_)
        | Op::Sgxt(_)
        | Op::Shfl(_) => false,

        // Predicate ops
        Op::PLop3(_) | Op::PSetP(_) => false,

        // Texture ops
        Op::Tex(_)
        | Op::Tld(_)
        | Op::Tld4(_)
        | Op::Tmml(_)
        | Op::Txd(_)
        | Op::Txq(_) => false,

        // Surface ops
        Op::SuLd(_)
        | Op::SuSt(_)
        | Op::SuAtom(_)
        | Op::SuLdGa(_)
        | Op::SuStGa(_) => false,

        // Memory ops
        Op::Ld(_)
        | Op::Ldc(_)
        | Op::LdSharedLock(_)
        | Op::St(_)
        | Op::StSCheckUnlock(_)
        | Op::Atom(_)
        | Op::AL2P(_)
        | Op::ALd(_)
        | Op::ASt(_)
        | Op::Ipa(_)
        | Op::CCtl(_)
        | Op::LdTram(_)
        | Op::MemBar(_) => false,

        // Control-flow ops
        Op::BClear(_)
        | Op::Break(_)
        | Op::BSSy(_)
        | Op::BSync(_)
        | Op::SSy(_)
        | Op::Sync(_)
        | Op::Brk(_)
        | Op::PBk(_)
        | Op::Cont(_)
        | Op::PCnt(_)
        | Op::Bra(_)
        | Op::Exit(_)
        | Op::WarpSync(_) => false,

        // Barrier
        Op::BMov(_) => false,

        // Geometry ops
        Op::Out(_) | Op::OutFinal(_) => false,

        // Miscellaneous ops
        Op::Bar(_)
        | Op::TexDepBar(_)
        | Op::CS2R(_)
        | Op::Isberd(_)
        | Op::ViLd(_)
        | Op::Kill(_)
        | Op::PixLd(_)
        | Op::S2R(_)
        | Op::Match(_)
        | Op::Nop(_)
        | Op::Vote(_) => false,
    }
}
}
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
@ -8523,6 +8684,13 @@ impl Instr {
self.op.src_types()
}
/// Returns an iterator over every SSA value this instruction reads.
///
/// The SSA values of each source come first, in source order, followed by
/// the SSA values of the predicate reference.
pub fn ssa_uses(&self) -> impl Iterator<Item = &SSAValue> {
    let from_srcs = self.srcs().iter().flat_map(|s| s.iter_ssa());
    let from_pred = self.pred.pred_ref.iter_ssa();
    from_srcs.chain(from_pred)
}
pub fn for_each_ssa_use(&self, mut f: impl FnMut(&SSAValue)) {
for ssa in self.pred.iter_ssa() {
f(ssa);
@ -9435,6 +9603,7 @@ pub fn max_warps_per_sm(sm: &ShaderModelInfo, gprs: u32) -> u32 {
// TODO: Take local_size and shared mem limit into account for compute
let total_regs: u32 = 65536;
// GPRs are allocated in multiples of 8
let gprs = max(gprs, 1);
let gprs = gprs.next_multiple_of(8);
let max_warps = prev_multiple_of((total_regs / 32) / gprs, 4);
min(max_warps, sm.warps_per_sm.into())

View file

@ -18,6 +18,7 @@ mod opt_crs;
mod opt_dce;
mod opt_instr_sched_common;
mod opt_instr_sched_postpass;
mod opt_instr_sched_prepass;
mod opt_jump_thread;
mod opt_lop;
mod opt_out;

View file

@ -8,7 +8,7 @@ use compiler::dataflow::BackwardDataflow;
use rustc_hash::{FxHashMap, FxHashSet};
use std::cmp::{max, min, Ord, Ordering};
#[derive(Clone)]
#[derive(Clone, Default)]
pub struct LiveSet {
live: PerRegFile<u32>,
set: FxHashSet<SSAValue>,
@ -16,10 +16,12 @@ pub struct LiveSet {
impl LiveSet {
pub fn new() -> LiveSet {
LiveSet {
live: Default::default(),
set: Default::default(),
}
Default::default()
}
pub fn clear(&mut self) {
self.live = Default::default();
self.set.clear();
}
pub fn contains(&self, ssa: &SSAValue) -> bool {

File diff suppressed because it is too large Load diff