nak: Add support for fddx and fddy

This uses SHFL in combination with FSWZADD.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24998>
This commit is contained in:
Mary Guillemard 2023-09-07 17:32:56 +02:00 committed by Marge Bot
parent e174fc9ab3
commit 04911df940
4 changed files with 288 additions and 1 deletions

View file

@ -515,6 +515,33 @@ impl SM75Instr {
self.set_pred_src(87..90, 90, op.accum);
}
/// Encodes an `OpFSwzAdd` as opcode 0x822.
///
/// The four entries of `op.ops` are packed two bits apiece into one byte,
/// with `ops[0]` in the most significant pair, and stored in bits 32..40.
fn encode_fswzadd(&mut self, op: &OpFSwzAdd) {
    // Shifting the accumulator left by two before OR-ing in each code puts
    // the first selector in bits 7:6 and the last one in bits 1:0.
    let packed_ops = op.ops.iter().fold(0x0_u8, |acc, swz_op| {
        let code = match swz_op {
            FSwzAddOp::Add => 0_u8,
            FSwzAddOp::SubLeft => 1_u8,
            FSwzAddOp::SubRight => 2_u8,
            FSwzAddOp::MoveLeft => 3_u8,
        };
        (acc << 2) | code
    });

    self.set_opcode(0x822);
    self.set_dst(op.dst);

    self.set_reg_src(24..32, op.srcs[0]);
    self.set_reg_src(64..72, op.srcs[1]);

    self.set_field(32..40, packed_ops);

    self.set_bit(77, false); // NDV
    self.set_rnd_mode(78..80, op.rnd_mode);
    self.set_bit(80, false); // TODO: FTZ
}
fn encode_mufu(&mut self, op: &OpMuFu) {
self.encode_alu(
0x108,
@ -854,6 +881,54 @@ impl SM75Instr {
self.set_pred_src(87..90, 90, op.cond);
}
/// Encodes an `OpShfl`.
///
/// `lane` and `c` must carry no source modifier and may each be either a
/// register or a 32-bit immediate; the pairing selects one of four opcodes.
///
/// # Panics
///
/// Panics with "Invalid instruction form" when `lane` or `c` is any other
/// kind of source.
fn encode_shfl(&mut self, op: &OpShfl) {
    assert!(op.lane.src_mod.is_none());
    assert!(op.c.src_mod.is_none());

    // Register operands go to bits 32..40 (lane) and 64..72 (c); immediate
    // operands go to bits 53..58 (lane) and 40..53 (c).
    match (&op.lane.src_ref, &op.c.src_ref) {
        (SrcRef::Reg(_), SrcRef::Reg(_)) => {
            self.set_opcode(0x389);
            self.set_reg_src(32..40, op.lane);
            self.set_reg_src(64..72, op.c);
        }
        (SrcRef::Reg(_), SrcRef::Imm32(imm_c)) => {
            self.set_opcode(0x589);
            self.set_reg_src(32..40, op.lane);
            self.set_field(40..53, *imm_c);
        }
        (SrcRef::Imm32(imm_lane), SrcRef::Reg(_)) => {
            self.set_opcode(0x989);
            self.set_field(53..58, *imm_lane);
            self.set_reg_src(64..72, op.c);
        }
        (SrcRef::Imm32(imm_lane), SrcRef::Imm32(imm_c)) => {
            self.set_opcode(0xf89);
            self.set_field(40..53, *imm_c);
            self.set_field(53..58, *imm_lane);
        }
        _ => panic!("Invalid instruction form"),
    }

    self.set_dst(op.dst);
    // No predicate destination is written.
    self.set_pred_dst(81..84, Dst::None);
    self.set_reg_src(24..32, op.src);

    // Two-bit shuffle mode selector.
    let mode = match op.op {
        ShflOp::Idx => 0_u8,
        ShflOp::Up => 1_u8,
        ShflOp::Down => 2_u8,
        ShflOp::Bfly => 3_u8,
    };
    self.set_field(58..60, mode);
}
fn encode_plop3(&mut self, op: &OpPLop3) {
self.set_opcode(0x81c);
self.set_field(16..24, op.ops[1].lut);
@ -1506,6 +1581,17 @@ impl SM75Instr {
self.set_field(90..91, false); /* NOT */
}
/// Encodes an `OpWarpSync` through the shared ALU encoder as opcode 0x148.
fn encode_warpsync(&mut self, op: &OpWarpSync) {
    // The mask is the only operand: no destination, and a 32-bit immediate
    // in the middle source position.
    let no_dst = None;
    let mask = ALUSrc::Imm32(op.mask);
    self.encode_alu(0x148, no_dst, ALUSrc::None, mask, ALUSrc::None);

    // The predicate source at bits 87..90 (with bit 90 passed alongside it)
    // is hard-wired to the constant-true predicate.
    self.set_pred_src(87..90, 90, SrcRef::True.into());
}
fn encode_bar(&mut self, _op: &OpBar) {
self.set_opcode(0x31d);
@ -1583,6 +1669,7 @@ impl SM75Instr {
Op::FMul(op) => si.encode_fmul(&op),
Op::FSet(op) => si.encode_fset(&op),
Op::FSetP(op) => si.encode_fsetp(&op),
Op::FSwzAdd(op) => si.encode_fswzadd(&op),
Op::MuFu(op) => si.encode_mufu(&op),
Op::Brev(op) => si.encode_brev(&op),
Op::Flo(op) => si.encode_flo(&op),
@ -1603,6 +1690,7 @@ impl SM75Instr {
Op::Mov(op) => si.encode_mov(&op),
Op::Prmt(op) => si.encode_prmt(&op),
Op::Sel(op) => si.encode_sel(&op),
Op::Shfl(op) => si.encode_shfl(&op),
Op::PLop3(op) => si.encode_plop3(&op),
Op::Tex(op) => si.encode_tex(&op),
Op::Tld(op) => si.encode_tld(&op),
@ -1624,6 +1712,7 @@ impl SM75Instr {
Op::MemBar(op) => si.encode_membar(&op),
Op::Bra(op) => si.encode_bra(&op, ip, block_offsets),
Op::Exit(op) => si.encode_exit(&op),
Op::WarpSync(op) => si.encode_warpsync(&op),
Op::Bar(op) => si.encode_bar(&op),
Op::CS2R(op) => si.encode_cs2r(&op),
Op::Kill(op) => si.encode_kill(&op),

View file

@ -804,6 +804,68 @@ impl<'a> ShaderFromNir<'a> {
});
dst
}
// X derivative: a butterfly lane shuffle followed by FSWZADD. The plain,
// coarse and fine NIR variants all share this single lowering.
nir_op_fddx | nir_op_fddx_coarse | nir_op_fddx_fine => {
// TODO: Real coarse derivatives
// Only 32-bit results are handled here.
assert!(alu.def.bit_size() == 32);
let scratch = b.alloc_ssa(RegFile::GPR, 1);
// Warp sync with every mask bit set, emitted ahead of the shuffle.
b.push_op(OpWarpSync { mask: u32::MAX });
// Butterfly shuffle of srcs[0] with lane operand 1; the result lands in
// `scratch`. `<<` binds tighter than `|`, so `c` is 0x1c03.
// NOTE(review): presumably lane operand 1 selects the horizontal neighbour
// in a 2x2 quad and `c` packs a clamp value with a segment mask - confirm
// against the SHFL documentation.
b.push_op(OpShfl {
dst: scratch[0].into(),
src: srcs[0],
lane: Src::new_imm_u32(1),
c: Src::new_imm_u32(0x3 | 0x1c << 8),
op: ShflOp::Bfly,
});
let dst = b.alloc_ssa(RegFile::GPR, 1);
// FSWZADD combines the shuffled value (srcs[0]) with this invocation's own
// value (srcs[1]) using four selectors that alternate SubLeft / SubRight.
// NOTE(review): presumably one selector per quad lane, chosen so every lane
// ends up with the same signed difference - confirm.
b.push_op(OpFSwzAdd {
dst: dst[0].into(),
srcs: [scratch[0].into(), srcs[0]],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
],
rnd_mode: FRndMode::NearestEven,
});
dst
}
// Y derivative: a butterfly lane shuffle followed by FSWZADD. The plain,
// coarse and fine NIR variants all share this single lowering.
nir_op_fddy | nir_op_fddy_coarse | nir_op_fddy_fine => {
// TODO: Real coarse derivatives
// Only 32-bit results are handled here.
assert!(alu.def.bit_size() == 32);
let scratch = b.alloc_ssa(RegFile::GPR, 1);
// Warp sync with every mask bit set, emitted ahead of the shuffle.
b.push_op(OpWarpSync { mask: u32::MAX });
// Butterfly shuffle of srcs[0] with lane operand 2; the result lands in
// `scratch`. `<<` binds tighter than `|`, so `c` is 0x1c03.
// NOTE(review): presumably lane operand 2 selects the vertical neighbour in
// a 2x2 quad and `c` packs a clamp value with a segment mask - confirm
// against the SHFL documentation.
b.push_op(OpShfl {
dst: scratch[0].into(),
src: srcs[0],
lane: Src::new_imm_u32(2),
c: Src::new_imm_u32(0x3 | 0x1c << 8),
op: ShflOp::Bfly,
});
let dst = b.alloc_ssa(RegFile::GPR, 1);
// FSWZADD combines the shuffled value (srcs[0]) with this invocation's own
// value (srcs[1]); the first two selectors are SubLeft, the last two
// SubRight.
// NOTE(review): presumably one selector per quad lane, chosen so every lane
// ends up with the same signed difference - confirm.
b.push_op(OpFSwzAdd {
dst: dst[0].into(),
srcs: [scratch[0].into(), srcs[0]],
ops: [
FSwzAddOp::SubLeft,
FSwzAddOp::SubLeft,
FSwzAddOp::SubRight,
FSwzAddOp::SubRight,
],
rnd_mode: FRndMode::NearestEven,
});
dst
}
_ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
};
self.set_dst(&alu.def, dst);

View file

@ -1988,6 +1988,59 @@ impl fmt::Display for OpFSetP {
}
}
/// One of the four operation selectors carried by an FSWZADD instruction.
///
/// The `Display` form is the assembly mnemonic of the selector.
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum FSwzAddOp {
    Add,
    SubRight,
    SubLeft,
    MoveLeft,
}

impl fmt::Display for FSwzAddOp {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the mnemonic first, then emit it with a single write.
        let mnemonic = match self {
            Self::Add => "ADD",
            Self::SubRight => "SUBR",
            Self::SubLeft => "SUB",
            Self::MoveLeft => "MOV2",
        };
        f.write_str(mnemonic)
    }
}
// FSWZADD instruction: one destination, two GPR sources, a rounding mode and
// four `FSwzAddOp` selectors. Field order is left untouched because the
// struct is `#[repr(C)]` and feeds the SrcsAsSlice / DstsAsSlice derives.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpFSwzAdd {
// Where the result is written.
pub dst: Dst,
// The two source operands, both tagged as GPR sources.
#[src_type(GPR)]
pub srcs: [Src; 2],
// Floating-point rounding mode; the Display impl only prints it when it is
// not NearestEven.
pub rnd_mode: FRndMode,
// Four operation selectors.
// NOTE(review): presumably one per lane of a 2x2 quad - confirm.
pub ops: [FSwzAddOp; 4],
}
// Prints `FSWZADD[.rnd] dst { src0, src1 } [op0, op1, op2, op3]`.
impl fmt::Display for OpFSwzAdd {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("FSWZADD")?;

        // The rounding-mode suffix only appears for non-default modes.
        if self.rnd_mode != FRndMode::NearestEven {
            write!(f, ".{}", self.rnd_mode)?;
        }

        let [src0, src1] = &self.srcs;
        let [op0, op1, op2, op3] = &self.ops;
        write!(
            f,
            " {} {{ {}, {} }} [{}, {}, {}, {}]",
            self.dst, src0, src1, op0, op1, op2, op3,
        )
    }
}
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum MuFuOp {
@ -2299,6 +2352,26 @@ impl fmt::Display for OpLop3 {
}
}
/// Shuffle mode of a SHFL instruction.
///
/// The `Display` form is the suffix printed after `SHFL.`.
#[allow(dead_code)]
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum ShflOp {
    Idx,
    Up,
    Down,
    Bfly,
}

impl fmt::Display for ShflOp {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the mnemonic first, then emit it with a single write.
        let mnemonic = match self {
            Self::Idx => "IDX",
            Self::Up => "UP",
            Self::Down => "DOWN",
            Self::Bfly => "BFLY",
        };
        f.write_str(mnemonic)
    }
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpShf {
@ -2619,6 +2692,33 @@ impl fmt::Display for OpSel {
}
}
// SHFL instruction: one destination, the value being shuffled, two further
// operands (`lane` and `c`) and the shuffle mode. Field order is left
// untouched because the struct is `#[repr(C)]` and feeds the SrcsAsSlice /
// DstsAsSlice derives.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpShfl {
// Where the shuffled value is written.
pub dst: Dst,
// The value to shuffle.
#[src_type(SSA)]
pub src: Src,
// Lane operand; the SM75 encoder accepts a register or a 32-bit immediate.
#[src_type(ALU)]
pub lane: Src,
// Second control operand; the SM75 encoder accepts a register or a 32-bit
// immediate.
// NOTE(review): presumably a packed clamp / segment-mask value - confirm.
#[src_type(ALU)]
pub c: Src,
// Shuffle mode (IDX, UP, DOWN or BFLY).
pub op: ShflOp,
}
// Prints `SHFL.<mode> dst { src, lane, c }`.
impl fmt::Display for OpShfl {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            dst,
            src,
            lane,
            c,
            op,
        } = self;
        write!(f, "SHFL.{} {} {{ {}, {}, {} }}", op, dst, src, lane, c)
    }
}
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpPLop3 {
@ -3254,6 +3354,18 @@ impl fmt::Display for OpExit {
}
}
// WARPSYNC instruction. It has no `Src` or `Dst` fields; its only payload is
// a 32-bit mask.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpWarpSync {
// Mask value, printed in hexadecimal by the Display impl.
// NOTE(review): presumably one bit per participating lane - confirm.
pub mask: u32,
}
// Prints `WARPSYNC 0x<mask>` with the mask in lowercase hexadecimal.
impl fmt::Display for OpWarpSync {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mask = self.mask;
        write!(f, "WARPSYNC 0x{:x}", mask)
    }
}
// BAR instruction. The struct is empty: it carries no sources, destinations
// or other parameters.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpBar {}
@ -3651,6 +3763,7 @@ pub enum Op {
MuFu(OpMuFu),
FSet(OpFSet),
FSetP(OpFSetP),
FSwzAdd(OpFSwzAdd),
DAdd(OpDAdd),
Brev(OpBrev),
Flo(OpFlo),
@ -3672,6 +3785,7 @@ pub enum Op {
Mov(OpMov),
Prmt(OpPrmt),
Sel(OpSel),
Shfl(OpShfl),
PLop3(OpPLop3),
Tex(OpTex),
Tld(OpTld),
@ -3693,6 +3807,7 @@ pub enum Op {
MemBar(OpMemBar),
Bra(OpBra),
Exit(OpExit),
WarpSync(OpWarpSync),
Bar(OpBar),
CS2R(OpCS2R),
Kill(OpKill),
@ -4016,6 +4131,7 @@ impl Instr {
| Op::Kill(_)
| Op::Bra(_)
| Op::Exit(_)
| Op::WarpSync(_)
| Op::Bar(_)
| Op::FSOut(_) => false,
_ => true,
@ -4030,7 +4146,8 @@ impl Instr {
| Op::FMnMx(_)
| Op::FMul(_)
| Op::FSet(_)
| Op::FSetP(_) => true,
| Op::FSetP(_)
| Op::FSwzAdd(_) => true,
// Multi-function unit is variable latency
Op::MuFu(_) => false,
@ -4056,6 +4173,7 @@ impl Instr {
// Move ops
Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => true,
Op::Shfl(_) => false,
// Predicate ops
Op::PLop3(_) => true,
@ -4084,6 +4202,7 @@ impl Instr {
// Control-flow ops
Op::Bra(_) | Op::Exit(_) => true,
Op::WarpSync(_) => false,
// Miscellaneous ops
Op::Bar(_)

View file

@ -46,6 +46,13 @@ fn copy_src(b: &mut impl SSABuilder, src: &mut Src, file: RegFile) {
src.src_ref = val.into();
}
/// Replaces `src` with a copy in `file` when it refers to a constant buffer.
///
/// Every other kind of source is left exactly as it is.
fn copy_src_if_cbuf(b: &mut impl SSABuilder, src: &mut Src, file: RegFile) {
    if let SrcRef::CBuf(_) = src.src_ref {
        copy_src(b, src, file);
    }
}
fn copy_src_if_not_reg(b: &mut impl SSABuilder, src: &mut Src, file: RegFile) {
if !src_is_reg(&src) {
copy_src(b, src, file);
@ -232,6 +239,16 @@ fn legalize_instr(b: &mut impl SSABuilder, instr: &mut Instr) {
copy_src_if_not_reg(b, src0, RegFile::GPR);
copy_src_if_not_reg(b, src2, RegFile::GPR);
}
// FSWZADD: both sources must be registers, so anything that is not already
// one is first copied into a GPR.
Op::FSwzAdd(op) => {
let [ref mut src0, ref mut src1] = op.srcs;
copy_src_if_not_reg(b, src0, RegFile::GPR);
copy_src_if_not_reg(b, src1, RegFile::GPR);
}
Op::Shfl(op) => {
// The shuffled value itself must be a register.
copy_src_if_not_reg(b, &mut op.src, RegFile::GPR);
// `lane` and `c` are copied to a GPR only when they are constant-buffer
// references; every other kind of source passes through unchanged.
// NOTE(review): encode_shfl accepts only a register or a 32-bit immediate
// for these two operands and panics on anything else - confirm that no
// other source kind can reach this point.
copy_src_if_cbuf(b, &mut op.lane, RegFile::GPR);
copy_src_if_cbuf(b, &mut op.c, RegFile::GPR);
}
Op::Ldc(_) => (), // Nothing to do
Op::Copy(_) => (), // Nothing to do
_ => {