From f3ce8fe90b93e7b9b898424333c260d9cadc461a Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Fri, 10 Apr 2026 23:47:15 +0200
Subject: [PATCH] nak: properly copy prop neg/abs float sources for flushed values

This allows us to copy prop fadd.ftz -rZ, -|ssa| into consuming
instructions giving us nice gains across the board.

Totals from 1033868 (85.24% of 1212873) affected shaders:
CodeSize: 8813536528 -> 8355226128 (-5.20%); split: -5.21%, +0.01%
Number of GPRs: 44954066 -> 44299483 (-1.46%); split: -1.52%, +0.06%
SLM Size: 799688 -> 798544 (-0.14%)
Static cycle count: 4646939330 -> 4485129185 (-3.48%); split: -3.67%, +0.18%
Spills to memory: 35405 -> 33136 (-6.41%); split: -6.41%, +0.01%
Fills from memory: 35405 -> 33136 (-6.41%); split: -6.41%, +0.01%
Spills to reg: 196547 -> 196231 (-0.16%); split: -1.22%, +1.06%
Fills from reg: 201227 -> 200988 (-0.12%); split: -1.00%, +0.88%
Max warps/SM: 44143984 -> 44306960 (+0.37%); split: +0.38%, -0.01%

Reviewed-by: Mary Guillemard
Reviewed-by: Mel Henning
Part-of:
---
Reviewer notes (this area is dropped by `git am`; it is not part of the
commit message):

* ir.rs: Instr::ftz() returns the op's own `ftz` field for the float and
  half-float ops listed first. The MuFu arm does not read an `ftz` field;
  it reports true only when op_type is F32 and the op is not Tanh. The
  remaining ops are listed explicitly and return false; the match has no
  wildcard arm.
* opt_copy_prop.rs: CopyEntry now records `ftz`. add_copy() forwards to
  add_copy_float() with ftz = false, so its existing callers keep their
  behaviour.
* HAdd2/FAdd with an fneg-zero operand are now recorded as copies even
  when the add has ftz set (saturate still blocks this); add.ftz is
  stored on the entry.
* prop_to_scalar_src() returns without propagating when the entry is ftz
  but the consuming instruction is not, and requires
  eq_ftz_mod(src_type) to hold whenever the entry has modifiers or ftz.
* The per-component loop touched by the -202 hunk skips ftz entries.
* Only the scalar-source arm of prop_to_src() passes instr_ftz on; the
  F64 arm still calls prop_to_f64_src() without it.
* NOTE(review): eq_ftz_mod() is not defined in this patch; presumably it
  comes from an earlier commit in the series -- confirm.
* NOTE(review): leading whitespace inside the hunks was rebuilt from
  rustfmt conventions. Line counts match every hunk header, but check
  the context indentation against the tree before applying.

 src/nouveau/compiler/nak/ir.rs            | 138 ++++++++++++++++++++++
 src/nouveau/compiler/nak/opt_copy_prop.rs |  61 ++++++++--
 2 files changed, 187 insertions(+), 12 deletions(-)

diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs
index 6daf4a2b54a..f4d5e03abed 100644
--- a/src/nouveau/compiler/nak/ir.rs
+++ b/src/nouveau/compiler/nak/ir.rs
@@ -8981,6 +8981,144 @@ impl Instr {
         }
         Ok(())
     }
+
+    pub fn ftz(&self) -> bool {
+        match &self.op {
+            Op::F2F(op) => op.ftz,
+            Op::F2I(op) => op.ftz,
+            Op::FAdd(op) => op.ftz,
+            Op::FFma(op) => op.ftz,
+            Op::FMnMx(op) => op.ftz,
+            Op::FMul(op) => op.ftz,
+            Op::FRnd(op) => op.ftz,
+            Op::FSet(op) => op.ftz,
+            Op::FSetP(op) => op.ftz,
+            Op::FSwz(op) => op.ftz,
+            Op::FSwzAdd(op) => op.ftz,
+            Op::HAdd2(op) => op.ftz,
+            Op::HFma2(op) => op.ftz,
+            Op::HMnMx2(op) => op.ftz,
+            Op::HMul2(op) => op.ftz,
+            Op::HSet2(op) => op.ftz,
+            Op::HSetP2(op) => op.ftz,
+            Op::MuFu(op) => {
+                op.op_type == FloatType::F32 && op.op != MuFuOp::Tanh
+            }
+
+            Op::Rro(_)
+            | Op::DAdd(_)
+            | Op::DFma(_)
+            | Op::DMnMx(_)
+            | Op::DMul(_)
+            | Op::DSetP(_)
+            | Op::Imma(_)
+            | Op::Hmma(_)
+            | Op::Ldsm(_)
+            | Op::BMsk(_)
+            | Op::BRev(_)
+            | Op::Bfe(_)
+            | Op::Flo(_)
+            | Op::IAbs(_)
+            | Op::IAdd2(_)
+            | Op::IAdd2X(_)
+            | Op::IAdd3(_)
+            | Op::IAdd3X(_)
+            | Op::IDp4(_)
+            | Op::IMad(_)
+            | Op::IMad64(_)
+            | Op::IMul(_)
+            | Op::IMnMx(_)
+            | Op::ISetP(_)
+            | Op::Lea(_)
+            | Op::LeaX(_)
+            | Op::Lop2(_)
+            | Op::Lop3(_)
+            | Op::PopC(_)
+            | Op::Shf(_)
+            | Op::Shl(_)
+            | Op::Shr(_)
+            | Op::F2FP(_)
+            | Op::I2F(_)
+            | Op::I2I(_)
+            | Op::Mov(_)
+            | Op::Movm(_)
+            | Op::Prmt(_)
+            | Op::Sel(_)
+            | Op::Sgxt(_)
+            | Op::Shfl(_)
+            | Op::PLop3(_)
+            | Op::PSetP(_)
+            | Op::R2UR(_)
+            | Op::Redux(_)
+            | Op::Tex(_)
+            | Op::Tld(_)
+            | Op::Tld4(_)
+            | Op::Tmml(_)
+            | Op::Txd(_)
+            | Op::Txq(_)
+            | Op::SuLd(_)
+            | Op::SuSt(_)
+            | Op::SuAtom(_)
+            | Op::SuClamp(_)
+            | Op::SuBfm(_)
+            | Op::SuEau(_)
+            | Op::IMadSp(_)
+            | Op::SuLdGa(_)
+            | Op::SuStGa(_)
+            | Op::Ld(_)
+            | Op::Ldc(_)
+            | Op::LdSharedLock(_)
+            | Op::St(_)
+            | Op::StSCheckUnlock(_)
+            | Op::Atom(_)
+            | Op::AL2P(_)
+            | Op::ALd(_)
+            | Op::ASt(_)
+            | Op::Ipa(_)
+            | Op::LdTram(_)
+            | Op::CCtl(_)
+            | Op::MemBar(_)
+            | Op::BClear(_)
+            | Op::BMov(_)
+            | Op::Break(_)
+            | Op::BSSy(_)
+            | Op::BSync(_)
+            | Op::Bra(_)
+            | Op::SSy(_)
+            | Op::Sync(_)
+            | Op::Brk(_)
+            | Op::PBk(_)
+            | Op::Cont(_)
+            | Op::PCnt(_)
+            | Op::Exit(_)
+            | Op::WarpSync(_)
+            | Op::Bar(_)
+            | Op::TexDepBar(_)
+            | Op::CS2R(_)
+            | Op::Isberd(_)
+            | Op::Isbewr(_)
+            | Op::ViLd(_)
+            | Op::Kill(_)
+            | Op::Nop(_)
+            | Op::PixLd(_)
+            | Op::S2R(_)
+            | Op::Vote(_)
+            | Op::Match(_)
+            | Op::Undef(_)
+            | Op::SrcBar(_)
+            | Op::PhiSrcs(_)
+            | Op::PhiDsts(_)
+            | Op::Copy(_)
+            | Op::Pin(_)
+            | Op::Unpin(_)
+            | Op::Swap(_)
+            | Op::ParCopy(_)
+            | Op::RegOut(_)
+            | Op::Out(_)
+            | Op::OutFinal(_)
+            | Op::Annotate(_) => false,
+        }
+    }
 }
 
 impl fmt::Display for Instr {
diff --git a/src/nouveau/compiler/nak/opt_copy_prop.rs b/src/nouveau/compiler/nak/opt_copy_prop.rs
index e1e2e3594bd..f6922288019 100644
--- a/src/nouveau/compiler/nak/opt_copy_prop.rs
+++ b/src/nouveau/compiler/nak/opt_copy_prop.rs
@@ -33,6 +33,7 @@ struct CopyEntry {
     bi: usize,
     src_type: SrcType,
     src: Src,
+    ftz: bool,
 }
 
 struct PrmtEntry {
@@ -71,10 +72,28 @@ impl<'a> CopyPropPass<'a> {
         dst: SSAValue,
         src_type: SrcType,
         src: Src,
+    ) {
+        self.add_copy_float(bi, dst, src_type, src, false);
+    }
+
+    fn add_copy_float(
+        &mut self,
+        bi: usize,
+        dst: SSAValue,
+        src_type: SrcType,
+        src: Src,
+        ftz: bool,
     ) {
         assert!(src.src_ref.get_reg().is_none());
-        self.ssa_map
-            .insert(dst, CopyPropEntry::Copy(CopyEntry { bi, src_type, src }));
+        self.ssa_map.insert(
+            dst,
+            CopyPropEntry::Copy(CopyEntry {
+                bi,
+                src_type,
+                src,
+                ftz,
+            }),
+        );
     }
 
     fn add_b2i(&mut self, _bi: usize, dst: SSAValue, src: Src) {
@@ -202,6 +221,10 @@ impl<'a> CopyPropPass<'a> {
                 continue;
             };
 
+            if entry.ftz {
+                continue;
+            }
+
             if entry.src.is_unmodified() {
                 if let SrcRef::SSA(entry_ssa) = &entry.src.src_ref {
                     assert!(entry_ssa.comps() == 1);
@@ -281,6 +304,7 @@ impl<'a> CopyPropPass<'a> {
         src_type: SrcType,
         cbuf_rule: &CBufRule,
         src: &mut Src,
+        instr_ftz: bool,
     ) {
         loop {
             let src_ssa = match &src.src_ref {
@@ -296,12 +320,19 @@ impl<'a> CopyPropPass<'a> {
 
             match entry {
                 CopyPropEntry::Copy(entry) => {
+                    // If the original op flushes denorms, but not the
+                    // consumer, we skip the propagation
+                    if entry.ftz && !instr_ftz {
+                        return;
+                    }
+
                     if !cbuf_rule.allows_src(entry.bi, &entry.src) {
                         return;
                     }
 
-                    // If there are modifiers, the source types have to match
-                    if !entry.src.is_unmodified()
+                    // If there are modifiers or ftz enabled, the source types
+                    // have to match
+                    if (!entry.src.is_unmodified() || entry.ftz)
                         && !entry.src_type.eq_ftz_mod(src_type)
                     {
                         return;
@@ -490,6 +521,7 @@ impl<'a> CopyPropPass<'a> {
         src_type: SrcType,
         cbuf_rule: &CBufRule,
         src: &mut Src,
+        instr_ftz: bool,
     ) {
         match src_type {
             SrcType::SSA => {
@@ -505,7 +537,7 @@ impl<'a> CopyPropPass<'a> {
             | SrcType::I32
             | SrcType::B32
             | SrcType::Pred => {
-                self.prop_to_scalar_src(src_type, cbuf_rule, src);
+                self.prop_to_scalar_src(src_type, cbuf_rule, src, instr_ftz);
             }
             SrcType::F64 => {
                 self.prop_to_f64_src(cbuf_rule, src);
@@ -528,20 +560,22 @@ impl<'a> CopyPropPass<'a> {
                 assert!(dst.comps() == 1);
                 let dst = dst[0];
 
-                if !add.saturate && !add.ftz {
+                if !add.saturate {
                     if add.srcs[0].is_fneg_zero(SrcType::F16v2) {
-                        self.add_copy(
+                        self.add_copy_float(
                             bi,
                             dst,
                             SrcType::F16v2,
                             add.srcs[1].clone(),
+                            add.ftz,
                         );
                     } else if add.srcs[1].is_fneg_zero(SrcType::F16v2) {
-                        self.add_copy(
+                        self.add_copy_float(
                             bi,
                             dst,
                             SrcType::F16v2,
                             add.srcs[0].clone(),
+                            add.ftz,
                         );
                     }
                 }
@@ -551,20 +585,22 @@ impl<'a> CopyPropPass<'a> {
                 assert!(dst.comps() == 1);
                 let dst = dst[0];
 
-                if !add.saturate && !add.ftz {
+                if !add.saturate {
                     if add.srcs[0].is_fneg_zero(SrcType::F32) {
-                        self.add_copy(
+                        self.add_copy_float(
                             bi,
                             dst,
                             SrcType::F32,
                             add.srcs[1].clone(),
+                            add.ftz,
                         );
                     } else if add.srcs[1].is_fneg_zero(SrcType::F32) {
-                        self.add_copy(
+                        self.add_copy_float(
                             bi,
                             dst,
                             SrcType::F32,
                             add.srcs[0].clone(),
+                            add.ftz,
                         );
                     }
                 }
@@ -814,6 +850,7 @@ impl<'a> CopyPropPass<'a> {
                 };
 
                 let src_types = instr.src_types();
+                let ftz = instr.ftz();
                 for (i, src) in instr.srcs_mut().iter_mut().enumerate() {
                     let mut src_type = src_types[i];
                     if force_alu_src_type {
@@ -825,7 +862,7 @@ impl<'a> CopyPropPass<'a> {
                             _ => panic!("Unhandled src_type"),
                         };
                     };
-                    self.prop_to_src(src_type, &cbuf_rule, src);
+                    self.prop_to_src(src_type, &cbuf_rule, src, ftz);
                 }
             }
         }