diff --git a/src/nouveau/compiler/nak_encode_sm75.rs b/src/nouveau/compiler/nak_encode_sm75.rs
index 17363370784..2923503a4ab 100644
--- a/src/nouveau/compiler/nak_encode_sm75.rs
+++ b/src/nouveau/compiler/nak_encode_sm75.rs
@@ -515,6 +515,37 @@ impl SM75Instr {
         self.set_pred_src(87..90, 90, op.accum);
     }
 
+    /// Encodes `OpFSwzAdd` as opcode 0x822.
+    ///
+    /// The four per-entry swizzle ops are packed two bits each into the
+    /// 8-bit field at bits 32..40, with `op.ops[0]` in the two most
+    /// significant bits and `op.ops[3]` in the two least significant.
+    fn encode_fswzadd(&mut self, op: &OpFSwzAdd) {
+        self.set_opcode(0x822);
+        self.set_dst(op.dst);
+
+        self.set_reg_src(24..32, op.srcs[0]);
+        self.set_reg_src(64..72, op.srcs[1]);
+
+        // Shifting the accumulator left before OR-ing in each code leaves
+        // ops[0] in the top bit pair without any index arithmetic.
+        let subop = op.ops.iter().fold(0_u8, |packed, swz_op| {
+            let code = match swz_op {
+                FSwzAddOp::Add => 0,
+                FSwzAddOp::SubRight => 2,
+                FSwzAddOp::SubLeft => 1,
+                FSwzAddOp::MoveLeft => 3,
+            };
+            (packed << 2) | code
+        });
+
+        self.set_field(32..40, subop);
+
+        self.set_bit(77, false); /* NDV */
+        self.set_rnd_mode(78..80, op.rnd_mode);
+        self.set_bit(80, false); /* TODO: FTZ */
+    }
+
     fn encode_mufu(&mut self, op: &OpMuFu) {
         self.encode_alu(
             0x108,
@@ -854,6 +885,64 @@ impl SM75Instr {
         self.set_pred_src(87..90, 90, op.cond);
     }
 
+    /// Encodes `OpShfl`.
+    ///
+    /// The opcode depends on whether `lane` and `c` are registers or 32-bit
+    /// immediates: 0x389 (reg, reg), 0x589 (reg, imm), 0x989 (imm, reg) and
+    /// 0xf89 (imm, imm), listed as (`lane`, `c`).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lane` or `c` carries a source modifier, or if either one is
+    /// neither a `SrcRef::Reg` nor a `SrcRef::Imm32`.
+    fn encode_shfl(&mut self, op: &OpShfl) {
+        assert!(op.lane.src_mod.is_none());
+        assert!(op.c.src_mod.is_none());
+
+        match &op.lane.src_ref {
+            SrcRef::Reg(_) => match &op.c.src_ref {
+                SrcRef::Reg(_) => {
+                    self.set_opcode(0x389);
+                    self.set_reg_src(32..40, op.lane);
+                    self.set_reg_src(64..72, op.c);
+                }
+                SrcRef::Imm32(imm_c) => {
+                    self.set_opcode(0x589);
+                    self.set_reg_src(32..40, op.lane);
+                    self.set_field(40..53, *imm_c);
+                }
+                _ => panic!("Invalid instruction form"),
+            },
+            SrcRef::Imm32(imm_lane) => match &op.c.src_ref {
+                SrcRef::Reg(_) => {
+                    self.set_opcode(0x989);
+                    self.set_field(53..58, *imm_lane);
+                    self.set_reg_src(64..72, op.c);
+                }
+                SrcRef::Imm32(imm_c) => {
+                    self.set_opcode(0xf89);
+                    self.set_field(40..53, *imm_c);
+                    self.set_field(53..58, *imm_lane);
+                }
+                _ => panic!("Invalid instruction form"),
+            },
+            _ => panic!("Invalid instruction form"),
+        };
+
+        self.set_dst(op.dst);
+        self.set_pred_dst(81..84, Dst::None);
+        self.set_reg_src(24..32, op.src);
+        self.set_field(
+            58..60,
+            match op.op {
+                ShflOp::Idx => 0_u8,
+                ShflOp::Up => 1_u8,
+                ShflOp::Down => 2_u8,
+                ShflOp::Bfly => 3_u8,
+            },
+        );
+    }
+
     fn encode_plop3(&mut self, op: &OpPLop3) {
         self.set_opcode(0x81c);
         self.set_field(16..24, op.ops[1].lut);
@@ -1506,6 +1595,20 @@ impl SM75Instr {
         self.set_field(90..91, false); /* NOT */
     }
 
+    /// Encodes `OpWarpSync` through `encode_alu` with opcode 0x148, passing
+    /// `op.mask` as the 32-bit immediate of the second source slot, and sets
+    /// the predicate source at bits 87..90 to `SrcRef::True`.
+    fn encode_warpsync(&mut self, op: &OpWarpSync) {
+        self.encode_alu(
+            0x148,
+            None,
+            ALUSrc::None,
+            ALUSrc::Imm32(op.mask),
+            ALUSrc::None,
+        );
+        self.set_pred_src(87..90, 90, SrcRef::True.into());
+    }
+
     fn encode_bar(&mut self, _op: &OpBar) {
         self.set_opcode(0x31d);
 
@@ -1583,6 +1686,7 @@ impl SM75Instr {
             Op::FMul(op) => si.encode_fmul(&op),
             Op::FSet(op) => si.encode_fset(&op),
             Op::FSetP(op) => si.encode_fsetp(&op),
+            Op::FSwzAdd(op) => si.encode_fswzadd(&op),
             Op::MuFu(op) => si.encode_mufu(&op),
             Op::Brev(op) => si.encode_brev(&op),
             Op::Flo(op) => si.encode_flo(&op),
@@ -1603,6 +1707,7 @@ impl SM75Instr {
             Op::Mov(op) => si.encode_mov(&op),
             Op::Prmt(op) => si.encode_prmt(&op),
             Op::Sel(op) => si.encode_sel(&op),
+            Op::Shfl(op) => si.encode_shfl(&op),
             Op::PLop3(op) => si.encode_plop3(&op),
             Op::Tex(op) => si.encode_tex(&op),
             Op::Tld(op) => si.encode_tld(&op),
@@ -1624,6 +1729,7 @@ impl SM75Instr {
             Op::MemBar(op) => si.encode_membar(&op),
             Op::Bra(op) => si.encode_bra(&op, ip, block_offsets),
             Op::Exit(op) => si.encode_exit(&op),
+            Op::WarpSync(op) => si.encode_warpsync(&op),
             Op::Bar(op) => si.encode_bar(&op),
             Op::CS2R(op) => si.encode_cs2r(&op),
             Op::Kill(op) => si.encode_kill(&op),
diff --git a/src/nouveau/compiler/nak_from_nir.rs b/src/nouveau/compiler/nak_from_nir.rs
index ee35e1a1704..2c7f5b924d3 100644
--- a/src/nouveau/compiler/nak_from_nir.rs
+++ b/src/nouveau/compiler/nak_from_nir.rs
@@ -804,6 +804,72 @@ impl<'a> ShaderFromNir<'a> {
                 });
                 dst
             }
+            nir_op_fddx | nir_op_fddx_coarse | nir_op_fddx_fine => {
+                // TODO: Real coarse derivatives
+
+                // WARPSYNC with an all-ones mask, then SHFL.BFLY of srcs[0]
+                // with lane operand 1, then FSWZADD of both values.
+                assert!(alu.def.bit_size() == 32);
+                let scratch = b.alloc_ssa(RegFile::GPR, 1);
+
+                b.push_op(OpWarpSync { mask: u32::MAX });
+                b.push_op(OpShfl {
+                    dst: scratch[0].into(),
+                    src: srcs[0],
+                    lane: Src::new_imm_u32(1),
+                    c: Src::new_imm_u32(0x3 | (0x1c << 8)),
+                    op: ShflOp::Bfly,
+                });
+
+                let dst = b.alloc_ssa(RegFile::GPR, 1);
+
+                b.push_op(OpFSwzAdd {
+                    dst: dst[0].into(),
+                    srcs: [scratch[0].into(), srcs[0]],
+                    ops: [
+                        FSwzAddOp::SubLeft,
+                        FSwzAddOp::SubRight,
+                        FSwzAddOp::SubLeft,
+                        FSwzAddOp::SubRight,
+                    ],
+                    rnd_mode: FRndMode::NearestEven,
+                });
+
+                dst
+            }
+            nir_op_fddy | nir_op_fddy_coarse | nir_op_fddy_fine => {
+                // TODO: Real coarse derivatives
+
+                // WARPSYNC with an all-ones mask, then SHFL.BFLY of srcs[0]
+                // with lane operand 2, then FSWZADD of both values.
+                assert!(alu.def.bit_size() == 32);
+                let scratch = b.alloc_ssa(RegFile::GPR, 1);
+
+                b.push_op(OpWarpSync { mask: u32::MAX });
+                b.push_op(OpShfl {
+                    dst: scratch[0].into(),
+                    src: srcs[0],
+                    lane: Src::new_imm_u32(2),
+                    c: Src::new_imm_u32(0x3 | (0x1c << 8)),
+                    op: ShflOp::Bfly,
+                });
+
+                let dst = b.alloc_ssa(RegFile::GPR, 1);
+
+                b.push_op(OpFSwzAdd {
+                    dst: dst[0].into(),
+                    srcs: [scratch[0].into(), srcs[0]],
+                    ops: [
+                        FSwzAddOp::SubLeft,
+                        FSwzAddOp::SubLeft,
+                        FSwzAddOp::SubRight,
+                        FSwzAddOp::SubRight,
+                    ],
+                    rnd_mode: FRndMode::NearestEven,
+                });
+
+                dst
+            }
             _ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
         };
         self.set_dst(&alu.def, dst);
diff --git a/src/nouveau/compiler/nak_ir.rs b/src/nouveau/compiler/nak_ir.rs
index 8833c1dd2e4..3975b8817ae 100644
--- a/src/nouveau/compiler/nak_ir.rs
+++ b/src/nouveau/compiler/nak_ir.rs
@@ -1988,6 +1988,62 @@ impl fmt::Display for OpFSetP {
     }
 }
 
+/// Per-entry operation selector stored in the `ops` array of `OpFSwzAdd`.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub enum FSwzAddOp {
+    Add,
+    SubRight,
+    SubLeft,
+    MoveLeft,
+}
+
+impl fmt::Display for FSwzAddOp {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            FSwzAddOp::Add => write!(f, "ADD"),
+            FSwzAddOp::SubRight => write!(f, "SUBR"),
+            FSwzAddOp::SubLeft => write!(f, "SUB"),
+            FSwzAddOp::MoveLeft => write!(f, "MOV2"),
+        }
+    }
+}
+
+/// The `FSWZADD` instruction, taking two GPR sources, a rounding mode and
+/// four per-entry [`FSwzAddOp`] selectors.
+#[repr(C)]
+#[derive(SrcsAsSlice, DstsAsSlice)]
+pub struct OpFSwzAdd {
+    pub dst: Dst,
+
+    #[src_type(GPR)]
+    pub srcs: [Src; 2],
+
+    pub rnd_mode: FRndMode,
+
+    pub ops: [FSwzAddOp; 4],
+}
+
+impl fmt::Display for OpFSwzAdd {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "FSWZADD")?;
+        if self.rnd_mode != FRndMode::NearestEven {
+            write!(f, ".{}", self.rnd_mode)?;
+        }
+        write!(
+            f,
+            " {} {{ {}, {} }} [{}, {}, {}, {}]",
+            self.dst,
+            self.srcs[0],
+            self.srcs[1],
+            self.ops[0],
+            self.ops[1],
+            self.ops[2],
+            self.ops[3],
+        )
+    }
+}
+
 #[allow(dead_code)]
 #[derive(Clone, Copy, Eq, PartialEq)]
 pub enum MuFuOp {
@@ -2299,6 +2355,27 @@ impl fmt::Display for OpLop3 {
     }
 }
 
+/// Shuffle mode of `OpShfl`, printed as `IDX`, `UP`, `DOWN` or `BFLY`.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub enum ShflOp {
+    Idx,
+    Up,
+    Down,
+    Bfly,
+}
+
+impl fmt::Display for ShflOp {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            ShflOp::Idx => write!(f, "IDX"),
+            ShflOp::Up => write!(f, "UP"),
+            ShflOp::Down => write!(f, "DOWN"),
+            ShflOp::Bfly => write!(f, "BFLY"),
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(SrcsAsSlice, DstsAsSlice)]
 pub struct OpShf {
@@ -2619,6 +2696,35 @@ impl fmt::Display for OpSel {
     }
 }
 
+/// The `SHFL` instruction: `src` is an SSA-typed source, `lane` and `c` are
+/// ALU-typed sources and `op` selects the [`ShflOp`] mode.
+#[repr(C)]
+#[derive(SrcsAsSlice, DstsAsSlice)]
+pub struct OpShfl {
+    pub dst: Dst,
+
+    #[src_type(SSA)]
+    pub src: Src,
+
+    #[src_type(ALU)]
+    pub lane: Src,
+
+    #[src_type(ALU)]
+    pub c: Src,
+
+    pub op: ShflOp,
+}
+
+impl fmt::Display for OpShfl {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "SHFL.{} {} {{ {}, {}, {} }}",
+            self.op, self.dst, self.src, self.lane, self.c
+        )
+    }
+}
+
 #[repr(C)]
 #[derive(SrcsAsSlice, DstsAsSlice)]
 pub struct OpPLop3 {
@@ -3254,6 +3360,19 @@ impl fmt::Display for OpExit {
     }
 }
 
+/// The `WARPSYNC` instruction; `mask` is printed in hexadecimal.
+#[repr(C)]
+#[derive(SrcsAsSlice, DstsAsSlice)]
+pub struct OpWarpSync {
+    pub mask: u32,
+}
+
+impl fmt::Display for OpWarpSync {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "WARPSYNC 0x{:x}", self.mask)
+    }
+}
+
 #[repr(C)]
 #[derive(SrcsAsSlice, DstsAsSlice)]
 pub struct OpBar {}
@@ -3651,6 +3770,7 @@ pub enum Op {
     MuFu(OpMuFu),
     FSet(OpFSet),
     FSetP(OpFSetP),
+    FSwzAdd(OpFSwzAdd),
     DAdd(OpDAdd),
     Brev(OpBrev),
     Flo(OpFlo),
@@ -3672,6 +3792,7 @@ pub enum Op {
     Mov(OpMov),
     Prmt(OpPrmt),
     Sel(OpSel),
+    Shfl(OpShfl),
     PLop3(OpPLop3),
     Tex(OpTex),
     Tld(OpTld),
@@ -3693,6 +3814,7 @@ pub enum Op {
     MemBar(OpMemBar),
     Bra(OpBra),
     Exit(OpExit),
+    WarpSync(OpWarpSync),
     Bar(OpBar),
     CS2R(OpCS2R),
     Kill(OpKill),
@@ -4016,6 +4138,7 @@ impl Instr {
             | Op::Kill(_)
             | Op::Bra(_)
             | Op::Exit(_)
+            | Op::WarpSync(_)
             | Op::Bar(_)
             | Op::FSOut(_) => false,
             _ => true,
@@ -4030,7 +4153,8 @@ impl Instr {
             | Op::FMnMx(_)
             | Op::FMul(_)
             | Op::FSet(_)
-            | Op::FSetP(_) => true,
+            | Op::FSetP(_)
+            | Op::FSwzAdd(_) => true,
 
             // Multi-function unit is variable latency
             Op::MuFu(_) => false,
@@ -4056,6 +4180,7 @@ impl Instr {
 
             // Move ops
             Op::Mov(_) | Op::Prmt(_) | Op::Sel(_) => true,
+            Op::Shfl(_) => false,
 
             // Predicate ops
             Op::PLop3(_) => true,
@@ -4084,6 +4209,7 @@ impl Instr {
 
             // Control-flow ops
             Op::Bra(_) | Op::Exit(_) => true,
+            Op::WarpSync(_) => false,
 
             // Miscellaneous ops
             Op::Bar(_)
diff --git a/src/nouveau/compiler/nak_legalize.rs b/src/nouveau/compiler/nak_legalize.rs
index 65a4ffb2049..c9d330631b5 100644
--- a/src/nouveau/compiler/nak_legalize.rs
+++ b/src/nouveau/compiler/nak_legalize.rs
@@ -46,6 +46,14 @@ fn copy_src(b: &mut impl SSABuilder, src: &mut Src, file: RegFile) {
     src.src_ref = val.into();
 }
 
+/// Runs `copy_src` on `src` only when its `src_ref` is a `SrcRef::CBuf`;
+/// every other kind of source is left untouched.
+fn copy_src_if_cbuf(b: &mut impl SSABuilder, src: &mut Src, file: RegFile) {
+    if let SrcRef::CBuf(_) = src.src_ref {
+        copy_src(b, src, file);
+    }
+}
+
 fn copy_src_if_not_reg(b: &mut impl SSABuilder, src: &mut Src, file: RegFile) {
     if !src_is_reg(&src) {
         copy_src(b, src, file);
@@ -232,6 +240,16 @@ fn legalize_instr(b: &mut impl SSABuilder, instr: &mut Instr) {
             copy_src_if_not_reg(b, src0, RegFile::GPR);
             copy_src_if_not_reg(b, src2, RegFile::GPR);
         }
+        Op::FSwzAdd(op) => {
+            let [ref mut src0, ref mut src1] = op.srcs;
+            copy_src_if_not_reg(b, src0, RegFile::GPR);
+            copy_src_if_not_reg(b, src1, RegFile::GPR);
+        }
+        Op::Shfl(op) => {
+            copy_src_if_not_reg(b, &mut op.src, RegFile::GPR);
+            copy_src_if_cbuf(b, &mut op.lane, RegFile::GPR);
+            copy_src_if_cbuf(b, &mut op.c, RegFile::GPR);
+        }
         Op::Ldc(_) => (), // Nothing to do
         Op::Copy(_) => (), // Nothing to do
         _ => {