mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 09:18:04 +02:00
nak: properly copy prop neg/abs float sources for flushed values
This allows us to copy prop `fadd.ftz -rZ, -|ssa|` into consuming instructions, giving us nice gains across the board.

Totals from 1033868 (85.24% of 1212873) affected shaders:
CodeSize: 8813536528 -> 8355226128 (-5.20%); split: -5.21%, +0.01%
Number of GPRs: 44954066 -> 44299483 (-1.46%); split: -1.52%, +0.06%
SLM Size: 799688 -> 798544 (-0.14%)
Static cycle count: 4646939330 -> 4485129185 (-3.48%); split: -3.67%, +0.18%
Spills to memory: 35405 -> 33136 (-6.41%); split: -6.41%, +0.01%
Fills from memory: 35405 -> 33136 (-6.41%); split: -6.41%, +0.01%
Spills to reg: 196547 -> 196231 (-0.16%); split: -1.22%, +1.06%
Fills from reg: 201227 -> 200988 (-0.12%); split: -1.00%, +0.88%
Max warps/SM: 44143984 -> 44306960 (+0.37%); split: +0.38%, -0.01%

Reviewed-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40897>
This commit is contained in:
parent
8170f18d9b
commit
f3ce8fe90b
2 changed files with 187 additions and 12 deletions
|
|
@ -8981,6 +8981,144 @@ impl Instr {
|
|||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reports whether this instruction flushes denormals (its `ftz` setting).
///
/// Ops that carry an `ftz` field return that field unchanged. `MuFu` is
/// answered from its operand type and function instead. Every other op
/// named in the match returns `false`.
///
/// The match has no wildcard arm, so each `Op` variant is spelled out
/// explicitly.
pub fn ftz(&self) -> bool {
|
||||
match &self.op {
|
||||
// Ops whose own `ftz` field is the answer.
Op::F2F(op) => op.ftz,
|
||||
Op::F2I(op) => op.ftz,
|
||||
Op::FAdd(op) => op.ftz,
|
||||
Op::FFma(op) => op.ftz,
|
||||
Op::FMnMx(op) => op.ftz,
|
||||
Op::FMul(op) => op.ftz,
|
||||
Op::FRnd(op) => op.ftz,
|
||||
Op::FSet(op) => op.ftz,
|
||||
Op::FSetP(op) => op.ftz,
|
||||
Op::FSwz(op) => op.ftz,
|
||||
Op::FSwzAdd(op) => op.ftz,
|
||||
Op::HAdd2(op) => op.ftz,
|
||||
Op::HFma2(op) => op.ftz,
|
||||
Op::HMnMx2(op) => op.ftz,
|
||||
Op::HMul2(op) => op.ftz,
|
||||
Op::HSet2(op) => op.ftz,
|
||||
Op::HSetP2(op) => op.ftz,
|
||||
// MuFu: true only for F32 operands and any function other than Tanh.
// NOTE(review): presumably mirrors how the hardware treats denormals
// for these functions; confirm against the ISA notes.
Op::MuFu(op) => {
|
||||
op.op_type == FloatType::F32 && op.op != MuFuOp::Tanh
|
||||
}
|
||||
|
||||
// Everything below reports `false`.
Op::Rro(_)
|
||||
| Op::DAdd(_)
|
||||
| Op::DFma(_)
|
||||
| Op::DMnMx(_)
|
||||
| Op::DMul(_)
|
||||
| Op::DSetP(_)
|
||||
| Op::Imma(_)
|
||||
| Op::Hmma(_)
|
||||
| Op::Ldsm(_)
|
||||
| Op::BMsk(_)
|
||||
| Op::BRev(_)
|
||||
| Op::Bfe(_)
|
||||
| Op::Flo(_)
|
||||
| Op::IAbs(_)
|
||||
| Op::IAdd2(_)
|
||||
| Op::IAdd2X(_)
|
||||
| Op::IAdd3(_)
|
||||
| Op::IAdd3X(_)
|
||||
| Op::IDp4(_)
|
||||
| Op::IMad(_)
|
||||
| Op::IMad64(_)
|
||||
| Op::IMul(_)
|
||||
| Op::IMnMx(_)
|
||||
| Op::ISetP(_)
|
||||
| Op::Lea(_)
|
||||
| Op::LeaX(_)
|
||||
| Op::Lop2(_)
|
||||
| Op::Lop3(_)
|
||||
| Op::PopC(_)
|
||||
| Op::Shf(_)
|
||||
| Op::Shl(_)
|
||||
| Op::Shr(_)
|
||||
| Op::F2FP(_)
|
||||
| Op::I2F(_)
|
||||
| Op::I2I(_)
|
||||
| Op::Mov(_)
|
||||
| Op::Movm(_)
|
||||
| Op::Prmt(_)
|
||||
| Op::Sel(_)
|
||||
| Op::Sgxt(_)
|
||||
| Op::Shfl(_)
|
||||
| Op::PLop3(_)
|
||||
| Op::PSetP(_)
|
||||
| Op::R2UR(_)
|
||||
| Op::Redux(_)
|
||||
| Op::Tex(_)
|
||||
| Op::Tld(_)
|
||||
| Op::Tld4(_)
|
||||
| Op::Tmml(_)
|
||||
| Op::Txd(_)
|
||||
| Op::Txq(_)
|
||||
| Op::SuLd(_)
|
||||
| Op::SuSt(_)
|
||||
| Op::SuAtom(_)
|
||||
| Op::SuClamp(_)
|
||||
| Op::SuBfm(_)
|
||||
| Op::SuEau(_)
|
||||
| Op::IMadSp(_)
|
||||
| Op::SuLdGa(_)
|
||||
| Op::SuStGa(_)
|
||||
| Op::Ld(_)
|
||||
| Op::Ldc(_)
|
||||
| Op::LdSharedLock(_)
|
||||
| Op::St(_)
|
||||
| Op::StSCheckUnlock(_)
|
||||
| Op::Atom(_)
|
||||
| Op::AL2P(_)
|
||||
| Op::ALd(_)
|
||||
| Op::ASt(_)
|
||||
| Op::Ipa(_)
|
||||
| Op::LdTram(_)
|
||||
| Op::CCtl(_)
|
||||
| Op::MemBar(_)
|
||||
| Op::BClear(_)
|
||||
| Op::BMov(_)
|
||||
| Op::Break(_)
|
||||
| Op::BSSy(_)
|
||||
| Op::BSync(_)
|
||||
| Op::Bra(_)
|
||||
| Op::SSy(_)
|
||||
| Op::Sync(_)
|
||||
| Op::Brk(_)
|
||||
| Op::PBk(_)
|
||||
| Op::Cont(_)
|
||||
| Op::PCnt(_)
|
||||
| Op::Exit(_)
|
||||
| Op::WarpSync(_)
|
||||
| Op::Bar(_)
|
||||
| Op::TexDepBar(_)
|
||||
| Op::CS2R(_)
|
||||
| Op::Isberd(_)
|
||||
| Op::Isbewr(_)
|
||||
| Op::ViLd(_)
|
||||
| Op::Kill(_)
|
||||
| Op::Nop(_)
|
||||
| Op::PixLd(_)
|
||||
| Op::S2R(_)
|
||||
| Op::Vote(_)
|
||||
| Op::Match(_)
|
||||
| Op::Undef(_)
|
||||
| Op::SrcBar(_)
|
||||
| Op::PhiSrcs(_)
|
||||
| Op::PhiDsts(_)
|
||||
| Op::Copy(_)
|
||||
| Op::Pin(_)
|
||||
| Op::Unpin(_)
|
||||
| Op::Swap(_)
|
||||
| Op::ParCopy(_)
|
||||
| Op::RegOut(_)
|
||||
| Op::Out(_)
|
||||
| Op::OutFinal(_)
|
||||
| Op::Annotate(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Instr {
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ struct CopyEntry {
|
|||
bi: usize,
|
||||
src_type: SrcType,
|
||||
src: Src,
|
||||
ftz: bool,
|
||||
}
|
||||
|
||||
struct PrmtEntry {
|
||||
|
|
@ -71,10 +72,28 @@ impl<'a> CopyPropPass<'a> {
|
|||
dst: SSAValue,
|
||||
src_type: SrcType,
|
||||
src: Src,
|
||||
) {
|
||||
self.add_copy_float(bi, dst, src_type, src, false);
|
||||
}
|
||||
|
||||
/// Records `dst` in `ssa_map` as a copy of `src`, remembering whether the
/// producing instruction flushed denormals.
///
/// * `bi` - stored on the entry as-is (presumably the index of the block
///   containing the copy; confirm against callers).
/// * `dst` - SSA value used as the map key.
/// * `src_type` - source type stored alongside `src`.
/// * `src` - the copied source; must not reference a register.
/// * `ftz` - flush flag stored on the entry; propagation later consults it
///   to decide whether a consumer may take this source.
///
/// # Panics
///
/// Panics if `src.src_ref.get_reg()` returns `Some`.
fn add_copy_float(
|
||||
&mut self,
|
||||
bi: usize,
|
||||
dst: SSAValue,
|
||||
src_type: SrcType,
|
||||
src: Src,
|
||||
ftz: bool,
|
||||
) {
|
||||
// Register sources are rejected up front.
assert!(src.src_ref.get_reg().is_none());
|
||||
// NOTE(review): this page shows removed and added diff lines together.
// The two-line insert directly below builds `CopyEntry` without `ftz`
// and looks like the pre-change form of the call that follows it;
// confirm against the real file.
self.ssa_map
|
||||
.insert(dst, CopyPropEntry::Copy(CopyEntry { bi, src_type, src }));
|
||||
self.ssa_map.insert(
|
||||
dst,
|
||||
CopyPropEntry::Copy(CopyEntry {
|
||||
bi,
|
||||
src_type,
|
||||
src,
|
||||
ftz,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
fn add_b2i(&mut self, _bi: usize, dst: SSAValue, src: Src) {
|
||||
|
|
@ -202,6 +221,10 @@ impl<'a> CopyPropPass<'a> {
|
|||
continue;
|
||||
};
|
||||
|
||||
if entry.ftz {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.src.is_unmodified() {
|
||||
if let SrcRef::SSA(entry_ssa) = &entry.src.src_ref {
|
||||
assert!(entry_ssa.comps() == 1);
|
||||
|
|
@ -281,6 +304,7 @@ impl<'a> CopyPropPass<'a> {
|
|||
src_type: SrcType,
|
||||
cbuf_rule: &CBufRule,
|
||||
src: &mut Src,
|
||||
instr_ftz: bool,
|
||||
) {
|
||||
loop {
|
||||
let src_ssa = match &src.src_ref {
|
||||
|
|
@ -296,12 +320,19 @@ impl<'a> CopyPropPass<'a> {
|
|||
|
||||
match entry {
|
||||
CopyPropEntry::Copy(entry) => {
|
||||
// If the original op flushes denorms, but not the
|
||||
// consumer, we skip the propagation
|
||||
if entry.ftz && !instr_ftz {
|
||||
return;
|
||||
}
|
||||
|
||||
if !cbuf_rule.allows_src(entry.bi, &entry.src) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If there are modifiers, the source types have to match
|
||||
if !entry.src.is_unmodified()
|
||||
// If there are modifiers or ftz enabled, the source types
|
||||
// have to match
|
||||
if (!entry.src.is_unmodified() || entry.ftz)
|
||||
&& !entry.src_type.eq_ftz_mod(src_type)
|
||||
{
|
||||
return;
|
||||
|
|
@ -490,6 +521,7 @@ impl<'a> CopyPropPass<'a> {
|
|||
src_type: SrcType,
|
||||
cbuf_rule: &CBufRule,
|
||||
src: &mut Src,
|
||||
instr_ftz: bool,
|
||||
) {
|
||||
match src_type {
|
||||
SrcType::SSA => {
|
||||
|
|
@ -505,7 +537,7 @@ impl<'a> CopyPropPass<'a> {
|
|||
| SrcType::I32
|
||||
| SrcType::B32
|
||||
| SrcType::Pred => {
|
||||
self.prop_to_scalar_src(src_type, cbuf_rule, src);
|
||||
self.prop_to_scalar_src(src_type, cbuf_rule, src, instr_ftz);
|
||||
}
|
||||
SrcType::F64 => {
|
||||
self.prop_to_f64_src(cbuf_rule, src);
|
||||
|
|
@ -528,20 +560,22 @@ impl<'a> CopyPropPass<'a> {
|
|||
assert!(dst.comps() == 1);
|
||||
let dst = dst[0];
|
||||
|
||||
if !add.saturate && !add.ftz {
|
||||
if !add.saturate {
|
||||
if add.srcs[0].is_fneg_zero(SrcType::F16v2) {
|
||||
self.add_copy(
|
||||
self.add_copy_float(
|
||||
bi,
|
||||
dst,
|
||||
SrcType::F16v2,
|
||||
add.srcs[1].clone(),
|
||||
add.ftz,
|
||||
);
|
||||
} else if add.srcs[1].is_fneg_zero(SrcType::F16v2) {
|
||||
self.add_copy(
|
||||
self.add_copy_float(
|
||||
bi,
|
||||
dst,
|
||||
SrcType::F16v2,
|
||||
add.srcs[0].clone(),
|
||||
add.ftz,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -551,20 +585,22 @@ impl<'a> CopyPropPass<'a> {
|
|||
assert!(dst.comps() == 1);
|
||||
let dst = dst[0];
|
||||
|
||||
if !add.saturate && !add.ftz {
|
||||
if !add.saturate {
|
||||
if add.srcs[0].is_fneg_zero(SrcType::F32) {
|
||||
self.add_copy(
|
||||
self.add_copy_float(
|
||||
bi,
|
||||
dst,
|
||||
SrcType::F32,
|
||||
add.srcs[1].clone(),
|
||||
add.ftz,
|
||||
);
|
||||
} else if add.srcs[1].is_fneg_zero(SrcType::F32) {
|
||||
self.add_copy(
|
||||
self.add_copy_float(
|
||||
bi,
|
||||
dst,
|
||||
SrcType::F32,
|
||||
add.srcs[0].clone(),
|
||||
add.ftz,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -814,6 +850,7 @@ impl<'a> CopyPropPass<'a> {
|
|||
};
|
||||
|
||||
let src_types = instr.src_types();
|
||||
let ftz = instr.ftz();
|
||||
for (i, src) in instr.srcs_mut().iter_mut().enumerate() {
|
||||
let mut src_type = src_types[i];
|
||||
if force_alu_src_type {
|
||||
|
|
@ -825,7 +862,7 @@ impl<'a> CopyPropPass<'a> {
|
|||
_ => panic!("Unhandled src_type"),
|
||||
};
|
||||
};
|
||||
self.prop_to_src(src_type, &cbuf_rule, src);
|
||||
self.prop_to_src(src_type, &cbuf_rule, src, ftz);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue