From ddbf2ec8839b1ffa1ebd665a9b9bc8db3b810357 Mon Sep 17 00:00:00 2001 From: Lorenzo Rossi Date: Thu, 17 Apr 2025 21:43:19 -0500 Subject: [PATCH] nak: Add a new OpFSwz and use it for derivatives on Kepler Part-of: --- src/nouveau/compiler/nak/from_nir.rs | 123 +++++++++++------- src/nouveau/compiler/nak/ir.rs | 76 +++++++++++ .../compiler/nak/opt_instr_sched_common.rs | 1 + 3 files changed, 155 insertions(+), 45 deletions(-) diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index 918d527ba67..40921d90c29 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2293,31 +2293,48 @@ impl<'a> ShaderFromNir<'a> { assert!(intrin.def.bit_size() == 32); let ftype = FloatType::F32; - let scratch = b.alloc_ssa(RegFile::GPR, 1); - - b.push_op(OpShfl { - dst: scratch[0].into(), - in_bounds: Dst::None, - src: self.get_src(&srcs[0]), - lane: 1_u32.into(), - c: (0x3_u32 | 0x1c_u32 << 8).into(), - op: ShflOp::Bfly, - }); let dst = b.alloc_ssa(RegFile::GPR, 1); - b.push_op(OpFSwzAdd { - dst: dst[0].into(), - srcs: [scratch[0].into(), self.get_src(&srcs[0])], - ops: [ - FSwzAddOp::SubLeft, - FSwzAddOp::SubRight, - FSwzAddOp::SubLeft, - FSwzAddOp::SubRight, - ], - rnd_mode: self.float_ctl[ftype].rnd_mode, - ftz: self.float_ctl[ftype].ftz, - }); + if self.sm.sm() >= 50 { + let scratch = b.alloc_ssa(RegFile::GPR, 1); + + b.push_op(OpShfl { + dst: scratch[0].into(), + in_bounds: Dst::None, + src: self.get_src(&srcs[0]), + lane: 1_u32.into(), + c: (0x3_u32 | 0x1c_u32 << 8).into(), + op: ShflOp::Bfly, + }); + + b.push_op(OpFSwzAdd { + dst: dst[0].into(), + srcs: [scratch[0].into(), self.get_src(&srcs[0])], + ops: [ + FSwzAddOp::SubLeft, + FSwzAddOp::SubRight, + FSwzAddOp::SubLeft, + FSwzAddOp::SubRight, + ], + rnd_mode: self.float_ctl[ftype].rnd_mode, + ftz: self.float_ctl[ftype].ftz, + }); + } else { + b.push_op(OpFSwz { + dst: dst[0].into(), + srcs: [self.get_src(&srcs[0]), self.get_src(&srcs[0])], + ops: [ + 
FSwzAddOp::SubLeft, + FSwzAddOp::SubRight, + FSwzAddOp::SubLeft, + FSwzAddOp::SubRight, + ], + rnd_mode: self.float_ctl[ftype].rnd_mode, + ftz: self.float_ctl[ftype].ftz, + shuffle: FSwzShuffle::SwapHorizontal, + }); + } self.set_dst(&intrin.def, dst); } @@ -2328,31 +2345,47 @@ impl<'a> ShaderFromNir<'a> { assert!(intrin.def.bit_size() == 32); let ftype = FloatType::F32; - let scratch = b.alloc_ssa(RegFile::GPR, 1); - - b.push_op(OpShfl { - dst: scratch[0].into(), - in_bounds: Dst::None, - src: self.get_src(&srcs[0]), - lane: 2_u32.into(), - c: (0x3_u32 | 0x1c_u32 << 8).into(), - op: ShflOp::Bfly, - }); - let dst = b.alloc_ssa(RegFile::GPR, 1); - b.push_op(OpFSwzAdd { - dst: dst[0].into(), - srcs: [scratch[0].into(), self.get_src(&srcs[0])], - ops: [ - FSwzAddOp::SubLeft, - FSwzAddOp::SubLeft, - FSwzAddOp::SubRight, - FSwzAddOp::SubRight, - ], - rnd_mode: self.float_ctl[ftype].rnd_mode, - ftz: self.float_ctl[ftype].ftz, - }); + if self.sm.sm() >= 50 { + let scratch = b.alloc_ssa(RegFile::GPR, 1); + + b.push_op(OpShfl { + dst: scratch[0].into(), + in_bounds: Dst::None, + src: self.get_src(&srcs[0]), + lane: 2_u32.into(), + c: (0x3_u32 | 0x1c_u32 << 8).into(), + op: ShflOp::Bfly, + }); + + b.push_op(OpFSwzAdd { + dst: dst[0].into(), + srcs: [scratch[0].into(), self.get_src(&srcs[0])], + ops: [ + FSwzAddOp::SubLeft, + FSwzAddOp::SubLeft, + FSwzAddOp::SubRight, + FSwzAddOp::SubRight, + ], + rnd_mode: self.float_ctl[ftype].rnd_mode, + ftz: self.float_ctl[ftype].ftz, + }); + } else { + b.push_op(OpFSwz { + dst: dst[0].into(), + srcs: [self.get_src(&srcs[0]), self.get_src(&srcs[0])], + ops: [ + FSwzAddOp::SubLeft, + FSwzAddOp::SubLeft, + FSwzAddOp::SubRight, + FSwzAddOp::SubRight, + ], + rnd_mode: self.float_ctl[ftype].rnd_mode, + ftz: self.float_ctl[ftype].ftz, + shuffle: FSwzShuffle::SwapVertical, + }); + } self.set_dst(&intrin.def, dst); } diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index 7befadd74f2..d817f825a59 100644 --- 
a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -2869,6 +2869,80 @@ impl DisplayOp for OpFSwzAdd { } impl_display_for_op!(OpFSwzAdd); +/// Describes where the second src is taken from before the ops are applied +#[allow(dead_code)] +#[derive(Clone, Copy, Eq, PartialEq)] +pub enum FSwzShuffle { + Quad0, + Quad1, + Quad2, + Quad3, + // swap [0, 1] and [2, 3] + SwapHorizontal, + // swap [0, 2] and [1, 3] + SwapVertical, +} + +impl fmt::Display for FSwzShuffle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + FSwzShuffle::Quad0 => write!(f, ".0000"), + FSwzShuffle::Quad1 => write!(f, ".1111"), + FSwzShuffle::Quad2 => write!(f, ".2222"), + FSwzShuffle::Quad3 => write!(f, ".3333"), + FSwzShuffle::SwapHorizontal => write!(f, ".1032"), + FSwzShuffle::SwapVertical => write!(f, ".2301"), + } + } +} + +/// Op only present on Kepler and older. +/// It first applies the shuffle to the second src and then computes +/// src0 op src1; each thread in a quad may perform a different operation. 
+/// +/// This is used to encode ddx/ddy +/// ex: ddx +/// src1 = shuffle swap horizontal src1 +/// ops = [sub, subr, sub, subr] +#[repr(C)] +#[derive(SrcsAsSlice, DstsAsSlice)] +pub struct OpFSwz { + #[dst_type(F32)] + pub dst: Dst, + + #[src_type(GPR)] + pub srcs: [Src; 2], + + pub rnd_mode: FRndMode, + pub ftz: bool, + pub shuffle: FSwzShuffle, + + pub ops: [FSwzAddOp; 4], +} + +impl DisplayOp for OpFSwz { + fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "fswz{}", self.shuffle)?; + if self.rnd_mode != FRndMode::NearestEven { + write!(f, "{}", self.rnd_mode)?; + } + if self.ftz { + write!(f, ".ftz")?; + } + write!( + f, + " {} {} [{}, {}, {}, {}]", + self.srcs[0], + self.srcs[1], + self.ops[0], + self.ops[1], + self.ops[2], + self.ops[3], + ) + } +} +impl_display_for_op!(OpFSwz); + pub enum RroOp { SinCos, Exp2, @@ -6472,6 +6546,7 @@ pub enum Op { FSet(OpFSet), FSetP(OpFSetP), FSwzAdd(OpFSwzAdd), + FSwz(OpFSwz), DAdd(OpDAdd), DFma(OpDFma), DMnMx(OpDMnMx), @@ -6621,6 +6696,7 @@ impl Op { | Op::HSet2(_) | Op::HSetP2(_) | Op::HMnMx2(_) + | Op::FSwz(_) | Op::FSwzAdd(_) => true, // Multi-function unit is variable latency diff --git a/src/nouveau/compiler/nak/opt_instr_sched_common.rs b/src/nouveau/compiler/nak/opt_instr_sched_common.rs index 113243b99ee..83513930516 100644 --- a/src/nouveau/compiler/nak/opt_instr_sched_common.rs +++ b/src/nouveau/compiler/nak/opt_instr_sched_common.rs @@ -96,6 +96,7 @@ pub fn side_effect_type(op: &Op) -> SideEffect { | Op::HSet2(_) | Op::HSetP2(_) | Op::HMnMx2(_) + | Op::FSwz(_) | Op::FSwzAdd(_) => SideEffect::None, // Multi-function unit