nak: Add a new OpFSwz and use it for derivatives on Kepler
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34540>
This commit is contained in:
Lorenzo Rossi 2025-04-17 21:43:19 -05:00 committed by Marge Bot
parent 309c48cbb7
commit ddbf2ec883
3 changed files with 155 additions and 45 deletions

View file

@ -2293,31 +2293,48 @@ impl<'a> ShaderFromNir<'a> {
assert!(intrin.def.bit_size() == 32);
let ftype = FloatType::F32;
let scratch = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpShfl {
dst: scratch[0].into(),
in_bounds: Dst::None,
src: self.get_src(&srcs[0]),
lane: 1_u32.into(),
c: (0x3_u32 | 0x1c_u32 << 8).into(),
op: ShflOp::Bfly,
});
let dst = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpFSwzAdd {
dst: dst[0].into(),
srcs: [scratch[0].into(), self.get_src(&srcs[0])],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
],
rnd_mode: self.float_ctl[ftype].rnd_mode,
ftz: self.float_ctl[ftype].ftz,
});
if self.sm.sm() >= 50 {
let scratch = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpShfl {
dst: scratch[0].into(),
in_bounds: Dst::None,
src: self.get_src(&srcs[0]),
lane: 1_u32.into(),
c: (0x3_u32 | 0x1c_u32 << 8).into(),
op: ShflOp::Bfly,
});
b.push_op(OpFSwzAdd {
dst: dst[0].into(),
srcs: [scratch[0].into(), self.get_src(&srcs[0])],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
],
rnd_mode: self.float_ctl[ftype].rnd_mode,
ftz: self.float_ctl[ftype].ftz,
});
} else {
b.push_op(OpFSwz {
dst: dst[0].into(),
srcs: [self.get_src(&srcs[0]), self.get_src(&srcs[0])],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
],
rnd_mode: self.float_ctl[ftype].rnd_mode,
ftz: self.float_ctl[ftype].ftz,
shuffle: FSwzShuffle::SwapHorizontal,
});
}
self.set_dst(&intrin.def, dst);
}
@ -2328,31 +2345,47 @@ impl<'a> ShaderFromNir<'a> {
assert!(intrin.def.bit_size() == 32);
let ftype = FloatType::F32;
let scratch = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpShfl {
dst: scratch[0].into(),
in_bounds: Dst::None,
src: self.get_src(&srcs[0]),
lane: 2_u32.into(),
c: (0x3_u32 | 0x1c_u32 << 8).into(),
op: ShflOp::Bfly,
});
let dst = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpFSwzAdd {
dst: dst[0].into(),
srcs: [scratch[0].into(), self.get_src(&srcs[0])],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubRight,
],
rnd_mode: self.float_ctl[ftype].rnd_mode,
ftz: self.float_ctl[ftype].ftz,
});
if self.sm.sm() >= 50 {
let scratch = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpShfl {
dst: scratch[0].into(),
in_bounds: Dst::None,
src: self.get_src(&srcs[0]),
lane: 2_u32.into(),
c: (0x3_u32 | 0x1c_u32 << 8).into(),
op: ShflOp::Bfly,
});
b.push_op(OpFSwzAdd {
dst: dst[0].into(),
srcs: [scratch[0].into(), self.get_src(&srcs[0])],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubRight,
],
rnd_mode: self.float_ctl[ftype].rnd_mode,
ftz: self.float_ctl[ftype].ftz,
});
} else {
b.push_op(OpFSwz {
dst: dst[0].into(),
srcs: [self.get_src(&srcs[0]), self.get_src(&srcs[0])],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubRight,
],
rnd_mode: self.float_ctl[ftype].rnd_mode,
ftz: self.float_ctl[ftype].ftz,
shuffle: FSwzShuffle::SwapVertical,
});
}
self.set_dst(&intrin.def, dst);
}

View file

@ -2869,6 +2869,80 @@ impl DisplayOp for OpFSwzAdd {
}
impl_display_for_op!(OpFSwzAdd);
/// Selects where the second source of an `OpFSwz` is taken from before the
/// per-lane operations are applied.
///
/// The `Display` form is a four-digit suffix. For the two swap variants the
/// digits agree with "lane i reads the value held by the lane named by
/// digit i"; presumably the `QuadN` variants broadcast lane N to the whole
/// quad in the same way (confirm against the encoder).
// Only the swap variants are constructed in the lowering code visible next to
// this definition, hence the lint allowance.
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum FSwzShuffle {
    Quad0,
    Quad1,
    Quad2,
    Quad3,
    /// Swap lanes [0, 1] and lanes [2, 3].
    SwapHorizontal,
    /// Swap lanes [0, 2] and lanes [1, 3].
    SwapVertical,
}

impl fmt::Display for FSwzShuffle {
    /// Writes the shuffle as a dot-prefixed, four-digit lane suffix.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let suffix = match *self {
            Self::Quad0 => ".0000",
            Self::Quad1 => ".1111",
            Self::Quad2 => ".2222",
            Self::Quad3 => ".3333",
            Self::SwapHorizontal => ".1032",
            Self::SwapVertical => ".2301",
        };
        f.write_str(suffix)
    }
}
/// Floating-point quad swizzle op, only present on Kepler and older.
///
/// It first shuffles the second source within the quad (see `shuffle`) and
/// then applies `src0 op src1`; each thread in a quad may do a different
/// operation, taken from `ops`.
///
/// This is used to encode ddx/ddy. Example for ddx:
///
/// ```text
/// src1 = shuffle swap horizontal src1
/// ops = [sub, subr, sub, subr]
/// ```
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFSwz {
/// Destination of the per-lane result.
#[dst_type(F32)]
pub dst: Dst,
/// The two operands; `srcs[1]` is the one that is shuffled before the op.
#[src_type(GPR)]
pub srcs: [Src; 2],
/// Rounding mode; `fmt_op` prints it as a suffix unless it is
/// `FRndMode::NearestEven`.
pub rnd_mode: FRndMode,
/// Printed as `.ftz` when set; presumably flush-to-zero for denormals
/// (confirm against the encoder).
pub ftz: bool,
/// Where the second source is taken from before the ops are applied.
pub shuffle: FSwzShuffle,
/// One operation per thread of a quad.
pub ops: [FSwzAddOp; 4],
}
impl DisplayOp for OpFSwz {
    /// Formats the op as
    /// `fswz<shuffle>[<rnd_mode>][.ftz] <src0> <src1> [<op0>, <op1>, <op2>, <op3>]`.
    ///
    /// The rounding-mode suffix is omitted for `FRndMode::NearestEven`, and
    /// `.ftz` is only emitted when the flag is set.
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            srcs: [src0, src1],
            ops: [op0, op1, op2, op3],
            rnd_mode,
            ftz,
            shuffle,
            ..
        } = self;

        // Mnemonic, immediately followed by the shuffle suffix.
        write!(f, "fswz{}", shuffle)?;

        // Optional modifiers, in the same order as before.
        if *rnd_mode != FRndMode::NearestEven {
            write!(f, "{}", rnd_mode)?;
        }
        if *ftz {
            f.write_str(".ftz")?;
        }

        // Operands, then the four per-lane operations in brackets.
        write!(
            f,
            " {} {} [{}, {}, {}, {}]",
            src0, src1, op0, op1, op2, op3,
        )
    }
}
impl_display_for_op!(OpFSwz);
pub enum RroOp {
SinCos,
Exp2,
@ -6472,6 +6546,7 @@ pub enum Op {
FSet(OpFSet),
FSetP(OpFSetP),
FSwzAdd(OpFSwzAdd),
FSwz(OpFSwz),
DAdd(OpDAdd),
DFma(OpDFma),
DMnMx(OpDMnMx),
@ -6621,6 +6696,7 @@ impl Op {
| Op::HSet2(_)
| Op::HSetP2(_)
| Op::HMnMx2(_)
| Op::FSwz(_)
| Op::FSwzAdd(_) => true,
// Multi-function unit is variable latency

View file

@ -96,6 +96,7 @@ pub fn side_effect_type(op: &Op) -> SideEffect {
| Op::HSet2(_)
| Op::HSetP2(_)
| Op::HMnMx2(_)
| Op::FSwz(_)
| Op::FSwzAdd(_) => SideEffect::None,
// Multi-function unit