nak: properly copy prop neg/abs float sources for flushed values
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

This allows us to copy-propagate `fadd.ftz -rZ, -|ssa|` into consuming
instructions, giving us nice gains across the board.

Totals from 1033868 (85.24% of 1212873) affected shaders:
CodeSize: 8813536528 -> 8355226128 (-5.20%); split: -5.21%, +0.01%
Number of GPRs: 44954066 -> 44299483 (-1.46%); split: -1.52%, +0.06%
SLM Size: 799688 -> 798544 (-0.14%)
Static cycle count: 4646939330 -> 4485129185 (-3.48%); split: -3.67%, +0.18%
Spills to memory: 35405 -> 33136 (-6.41%); split: -6.41%, +0.01%
Fills from memory: 35405 -> 33136 (-6.41%); split: -6.41%, +0.01%
Spills to reg: 196547 -> 196231 (-0.16%); split: -1.22%, +1.06%
Fills from reg: 201227 -> 200988 (-0.12%); split: -1.00%, +0.88%
Max warps/SM: 44143984 -> 44306960 (+0.37%); split: +0.38%, -0.01%

Reviewed-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40897>
This commit is contained in:
Karol Herbst 2026-04-10 23:47:15 +02:00 committed by Marge Bot
parent 8170f18d9b
commit f3ce8fe90b
2 changed files with 187 additions and 12 deletions

View file

@ -8981,6 +8981,144 @@ impl Instr {
}
Ok(())
}
/// Returns whether this instruction flushes denorms ("ftz").
///
/// Ops that carry an `ftz` field simply report that field. `MuFu` is
/// classified from its `op_type` and `op` instead. All remaining ops listed
/// in the final arm report `false`.
///
/// The final arm names its variants explicitly rather than using a wildcard.
pub fn ftz(&self) -> bool {
    match &self.op {
        // Ops with an explicit `ftz` field: forward it unchanged.
        Op::F2F(cvt) => cvt.ftz,
        Op::F2I(cvt) => cvt.ftz,
        Op::FAdd(fadd) => fadd.ftz,
        Op::FFma(ffma) => ffma.ftz,
        Op::FMnMx(fmnmx) => fmnmx.ftz,
        Op::FMul(fmul) => fmul.ftz,
        Op::FRnd(frnd) => frnd.ftz,
        Op::FSet(fset) => fset.ftz,
        Op::FSetP(fsetp) => fsetp.ftz,
        Op::FSwz(fswz) => fswz.ftz,
        Op::FSwzAdd(fswzadd) => fswzadd.ftz,
        Op::HAdd2(hadd2) => hadd2.ftz,
        Op::HFma2(hfma2) => hfma2.ftz,
        Op::HMnMx2(hmnmx2) => hmnmx2.ftz,
        Op::HMul2(hmul2) => hmul2.ftz,
        Op::HSet2(hset2) => hset2.ftz,
        Op::HSetP2(hsetp2) => hsetp2.ftz,

        // No `ftz` field is consulted for MuFu: it counts as flushing when
        // it operates on F32 and is any function other than Tanh.
        // NOTE(review): presumably mirrors hardware behaviour -- confirm.
        Op::MuFu(mufu) => {
            let is_f32 = mufu.op_type == FloatType::F32;
            let not_tanh = mufu.op != MuFuOp::Tanh;
            is_f32 && not_tanh
        }

        // Everything below never reports ftz.
        Op::Rro(_)
        | Op::DAdd(_)
        | Op::DFma(_)
        | Op::DMnMx(_)
        | Op::DMul(_)
        | Op::DSetP(_)
        | Op::Imma(_)
        | Op::Hmma(_)
        | Op::Ldsm(_)
        | Op::BMsk(_)
        | Op::BRev(_)
        | Op::Bfe(_)
        | Op::Flo(_)
        | Op::IAbs(_)
        | Op::IAdd2(_)
        | Op::IAdd2X(_)
        | Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::IDp4(_)
        | Op::IMad(_)
        | Op::IMad64(_)
        | Op::IMul(_)
        | Op::IMnMx(_)
        | Op::ISetP(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::Lop2(_)
        | Op::Lop3(_)
        | Op::PopC(_)
        | Op::Shf(_)
        | Op::Shl(_)
        | Op::Shr(_)
        | Op::F2FP(_)
        | Op::I2F(_)
        | Op::I2I(_)
        | Op::Mov(_)
        | Op::Movm(_)
        | Op::Prmt(_)
        | Op::Sel(_)
        | Op::Sgxt(_)
        | Op::Shfl(_)
        | Op::PLop3(_)
        | Op::PSetP(_)
        | Op::R2UR(_)
        | Op::Redux(_)
        | Op::Tex(_)
        | Op::Tld(_)
        | Op::Tld4(_)
        | Op::Tmml(_)
        | Op::Txd(_)
        | Op::Txq(_)
        | Op::SuLd(_)
        | Op::SuSt(_)
        | Op::SuAtom(_)
        | Op::SuClamp(_)
        | Op::SuBfm(_)
        | Op::SuEau(_)
        | Op::IMadSp(_)
        | Op::SuLdGa(_)
        | Op::SuStGa(_)
        | Op::Ld(_)
        | Op::Ldc(_)
        | Op::LdSharedLock(_)
        | Op::St(_)
        | Op::StSCheckUnlock(_)
        | Op::Atom(_)
        | Op::AL2P(_)
        | Op::ALd(_)
        | Op::ASt(_)
        | Op::Ipa(_)
        | Op::LdTram(_)
        | Op::CCtl(_)
        | Op::MemBar(_)
        | Op::BClear(_)
        | Op::BMov(_)
        | Op::Break(_)
        | Op::BSSy(_)
        | Op::BSync(_)
        | Op::Bra(_)
        | Op::SSy(_)
        | Op::Sync(_)
        | Op::Brk(_)
        | Op::PBk(_)
        | Op::Cont(_)
        | Op::PCnt(_)
        | Op::Exit(_)
        | Op::WarpSync(_)
        | Op::Bar(_)
        | Op::TexDepBar(_)
        | Op::CS2R(_)
        | Op::Isberd(_)
        | Op::Isbewr(_)
        | Op::ViLd(_)
        | Op::Kill(_)
        | Op::Nop(_)
        | Op::PixLd(_)
        | Op::S2R(_)
        | Op::Vote(_)
        | Op::Match(_)
        | Op::Undef(_)
        | Op::SrcBar(_)
        | Op::PhiSrcs(_)
        | Op::PhiDsts(_)
        | Op::Copy(_)
        | Op::Pin(_)
        | Op::Unpin(_)
        | Op::Swap(_)
        | Op::ParCopy(_)
        | Op::RegOut(_)
        | Op::Out(_)
        | Op::OutFinal(_)
        | Op::Annotate(_) => false,
    }
}
}
impl fmt::Display for Instr {

View file

@ -33,6 +33,7 @@ struct CopyEntry {
bi: usize,
src_type: SrcType,
src: Src,
ftz: bool,
}
struct PrmtEntry {
@ -71,10 +72,28 @@ impl<'a> CopyPropPass<'a> {
dst: SSAValue,
src_type: SrcType,
src: Src,
) {
self.add_copy_float(bi, dst, src_type, src, false);
}
fn add_copy_float(
&mut self,
bi: usize,
dst: SSAValue,
src_type: SrcType,
src: Src,
ftz: bool,
) {
assert!(src.src_ref.get_reg().is_none());
self.ssa_map
.insert(dst, CopyPropEntry::Copy(CopyEntry { bi, src_type, src }));
self.ssa_map.insert(
dst,
CopyPropEntry::Copy(CopyEntry {
bi,
src_type,
src,
ftz,
}),
);
}
fn add_b2i(&mut self, _bi: usize, dst: SSAValue, src: Src) {
@ -202,6 +221,10 @@ impl<'a> CopyPropPass<'a> {
continue;
};
if entry.ftz {
continue;
}
if entry.src.is_unmodified() {
if let SrcRef::SSA(entry_ssa) = &entry.src.src_ref {
assert!(entry_ssa.comps() == 1);
@ -281,6 +304,7 @@ impl<'a> CopyPropPass<'a> {
src_type: SrcType,
cbuf_rule: &CBufRule,
src: &mut Src,
instr_ftz: bool,
) {
loop {
let src_ssa = match &src.src_ref {
@ -296,12 +320,19 @@ impl<'a> CopyPropPass<'a> {
match entry {
CopyPropEntry::Copy(entry) => {
// If the original op flushes denorms, but not the
// consumer, we skip the propagation
if entry.ftz && !instr_ftz {
return;
}
if !cbuf_rule.allows_src(entry.bi, &entry.src) {
return;
}
// If there are modifiers, the source types have to match
if !entry.src.is_unmodified()
// If there are modifiers or ftz enabled, the source types
// have to match
if (!entry.src.is_unmodified() || entry.ftz)
&& !entry.src_type.eq_ftz_mod(src_type)
{
return;
@ -490,6 +521,7 @@ impl<'a> CopyPropPass<'a> {
src_type: SrcType,
cbuf_rule: &CBufRule,
src: &mut Src,
instr_ftz: bool,
) {
match src_type {
SrcType::SSA => {
@ -505,7 +537,7 @@ impl<'a> CopyPropPass<'a> {
| SrcType::I32
| SrcType::B32
| SrcType::Pred => {
self.prop_to_scalar_src(src_type, cbuf_rule, src);
self.prop_to_scalar_src(src_type, cbuf_rule, src, instr_ftz);
}
SrcType::F64 => {
self.prop_to_f64_src(cbuf_rule, src);
@ -528,20 +560,22 @@ impl<'a> CopyPropPass<'a> {
assert!(dst.comps() == 1);
let dst = dst[0];
if !add.saturate && !add.ftz {
if !add.saturate {
if add.srcs[0].is_fneg_zero(SrcType::F16v2) {
self.add_copy(
self.add_copy_float(
bi,
dst,
SrcType::F16v2,
add.srcs[1].clone(),
add.ftz,
);
} else if add.srcs[1].is_fneg_zero(SrcType::F16v2) {
self.add_copy(
self.add_copy_float(
bi,
dst,
SrcType::F16v2,
add.srcs[0].clone(),
add.ftz,
);
}
}
@ -551,20 +585,22 @@ impl<'a> CopyPropPass<'a> {
assert!(dst.comps() == 1);
let dst = dst[0];
if !add.saturate && !add.ftz {
if !add.saturate {
if add.srcs[0].is_fneg_zero(SrcType::F32) {
self.add_copy(
self.add_copy_float(
bi,
dst,
SrcType::F32,
add.srcs[1].clone(),
add.ftz,
);
} else if add.srcs[1].is_fneg_zero(SrcType::F32) {
self.add_copy(
self.add_copy_float(
bi,
dst,
SrcType::F32,
add.srcs[0].clone(),
add.ftz,
);
}
}
@ -814,6 +850,7 @@ impl<'a> CopyPropPass<'a> {
};
let src_types = instr.src_types();
let ftz = instr.ftz();
for (i, src) in instr.srcs_mut().iter_mut().enumerate() {
let mut src_type = src_types[i];
if force_alu_src_type {
@ -825,7 +862,7 @@ impl<'a> CopyPropPass<'a> {
_ => panic!("Unhandled src_type"),
};
};
self.prop_to_src(src_type, &cbuf_rule, src);
self.prop_to_src(src_type, &cbuf_rule, src, ftz);
}
}
}