From 5a140e7c3e6351f52f0087be8a9f414cd4e3f747 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Fri, 18 Apr 2025 14:23:50 -0500
Subject: [PATCH] nak/sm20: Add more memory ops

Part-of:
---
 src/nouveau/compiler/nak/sm20.rs | 201 +++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)

diff --git a/src/nouveau/compiler/nak/sm20.rs b/src/nouveau/compiler/nak/sm20.rs
index 04542c11cfb..c24be005a68 100644
--- a/src/nouveau/compiler/nak/sm20.rs
+++ b/src/nouveau/compiler/nak/sm20.rs
@@ -1862,6 +1862,136 @@ impl SM20Op for OpSt {
     }
 }
 
+/// Returns `src` as an [`SSARef`].
+///
+/// An SSA source is returned as-is.  Any other source is replaced by a
+/// freshly allocated GPR vector of one (32-bit `atom_type`) or two (64-bit
+/// `atom_type`) components, each initialized with a copy of zero.
+///
+/// NOTE(review): the value of a non-SSA `src` is never inspected, so this
+/// presumably relies on such sources always being zero -- confirm.
+fn atom_src_as_ssa(
+    b: &mut LegalizeBuilder,
+    src: Src,
+    atom_type: AtomType,
+) -> SSARef {
+    if let Some(ssa) = src.as_ssa() {
+        return *ssa;
+    }
+
+    let tmp;
+    if atom_type.bits() == 32 {
+        tmp = b.alloc_ssa(RegFile::GPR, 1);
+        b.copy_to(tmp.into(), 0.into());
+    } else {
+        debug_assert!(atom_type.bits() == 64);
+        tmp = b.alloc_ssa(RegFile::GPR, 2);
+        b.copy_to(tmp[0].into(), 0.into());
+        b.copy_to(tmp[1].into(), 0.into());
+    }
+    tmp
+}
+
+impl SM20Op for OpAtom {
+    /// Rewrites a compare-exchange with separate `cmpr` and `data` sources
+    /// into the packed form: both are concatenated (`cmpr` first) into one
+    /// SSA vector stored in `data`, and `cmpr` is replaced by zero.
+    fn legalize(&mut self, b: &mut LegalizeBuilder) {
+        if self.atom_op == AtomOp::CmpExch(AtomCmpSrc::Separate) {
+            let cmpr = atom_src_as_ssa(b, self.cmpr, self.atom_type);
+            let data = atom_src_as_ssa(b, self.data, self.atom_type);
+
+            let mut cmpr_data = Vec::new();
+            cmpr_data.extend_from_slice(&cmpr);
+            cmpr_data.extend_from_slice(&data);
+            let cmpr_data = SSARef::try_from(cmpr_data).unwrap();
+
+            self.cmpr = 0.into();
+            self.data = cmpr_data.into();
+            self.atom_op = AtomOp::CmpExch(AtomCmpSrc::Packed);
+        }
+        legalize_ext_instr(self, b);
+    }
+
+    /// Encodes the atomic.  Panics if the memory space is not
+    /// `MemSpace::Global(MemAddrType::A64)` or the type is `AtomType::F16x2`.
+    fn encode(&self, e: &mut SM20Encoder<'_>) {
+        let MemSpace::Global(addr_type) = self.mem_space else {
+            panic!("SM20 only supports global atomics");
+        };
+        assert!(addr_type == MemAddrType::A64);
+
+        // The opcode depends on whether a destination is written.
+        if self.dst.is_none() {
+            e.set_opcode(SM20Unit::Mem, 0x1);
+        } else {
+            e.set_opcode(SM20Unit::Mem, 0x11);
+        }
+
+        let op = match self.atom_op {
+            AtomOp::Add => 0_u8,
+            AtomOp::Min => 1_u8,
+            AtomOp::Max => 2_u8,
+            AtomOp::Inc => 3_u8,
+            AtomOp::Dec => 4_u8,
+            AtomOp::And => 5_u8,
+            AtomOp::Or => 6_u8,
+            AtomOp::Xor => 7_u8,
+            AtomOp::Exch => 8_u8,
+            AtomOp::CmpExch(_) => 9_u8,
+        };
+        e.set_field(5..9, op);
+
+        let typ = match self.atom_type {
+            AtomType::F16x2 => panic!("Unsupported atomic type"),
+            // AtomType::U8 => 0x0_u8,
+            // AtomType::I8 => 0x1_u8,
+            // AtomType::U16 => 0x2_u8,
+            // AtomType::I16 => 0x3_u8,
+            AtomType::U32 => 0x4_u8,
+            AtomType::U64 => 0x5_u8,
+            //AtomType::U128 => 0x6_u8,
+            AtomType::I32 => 0x7_u8,
+            AtomType::I64 => 0x8_u8,
+            //AtomType::I128 => 0x9_u8,
+            //AtomType::F16 => 0xa_u8,
+            AtomType::F64 => 0xc_u8,
+            AtomType::F32 => 0xd_u8,
+        };
+        // The type code is split: bit 0 goes in 9..10, the rest in 59..62.
+        e.set_field(9..10, typ & 0x1);
+        e.set_field(59..62, typ >> 1);
+
+        e.set_reg_src(20..26, self.addr);
+        e.set_reg_src(14..20, self.data);
+
+        // Without a destination the offset is one contiguous field; with
+        // one, it is split around the destination and second-source fields.
+        if self.dst.is_none() {
+            e.set_field(26..58, self.addr_offset);
+        } else {
+            e.set_dst(43..49, self.dst);
+            e.set_field(26..43, self.addr_offset & 0x1ffff);
+            e.set_field(55..58, self.addr_offset >> 17);
+        }
+
+        if let AtomOp::CmpExch(cmp_src) = self.atom_op {
+            // The hardware expects the first source to be packed and then the
+            // second source to be the top half of the first.
+            assert!(cmp_src == AtomCmpSrc::Packed);
+            let cmpr_data = self.data.src_ref.as_reg().unwrap();
+            assert!(cmpr_data.comps() % 2 == 0);
+            let data_comps = cmpr_data.comps() / 2;
+            let data_idx = cmpr_data.base_idx() + u32::from(data_comps);
+            let data = RegRef::new(cmpr_data.file(), data_idx, data_comps);
+
+            e.set_reg_src(49..55, data.into());
+        } else if !self.dst.is_none() {
+            e.set_reg_src(49..55, 0.into());
+        }
+    }
+}
+
 impl SM20Op for OpALd {
     fn legalize(&mut self, b: &mut LegalizeBuilder) {
         legalize_ext_instr(self, b);
@@ -1940,6 +2070,74 @@ impl SM20Op for OpIpa {
     }
 }
 
+impl SM20Op for OpCCtl {
+    fn legalize(&mut self, b: &mut LegalizeBuilder) {
+        legalize_ext_instr(self, b);
+    }
+
+    /// Encodes a `cctl` instruction.  Panics for `MemSpace::Local` and for
+    /// the `IVAllP` and `WBAllP` operations.
+    fn encode(&self, e: &mut SM20Encoder<'_>) {
+        let op = match self.mem_space {
+            MemSpace::Global(MemAddrType::A32) => 0x26,
+            MemSpace::Global(MemAddrType::A64) => 0x27,
+            MemSpace::Local => panic!("cctl does not support local"),
+            MemSpace::Shared => 0x34,
+        };
+        e.set_opcode(SM20Unit::Mem, op);
+
+        e.set_field(
+            5..10,
+            match self.op {
+                CCtlOp::Qry1 => 0_u8,
+                CCtlOp::PF1 => 1_u8,
+                CCtlOp::PF1_5 => 2_u8,
+                CCtlOp::PF2 => 3_u8,
+                CCtlOp::WB => 4_u8,
+                CCtlOp::IV => 5_u8,
+                CCtlOp::IVAll => 6_u8,
+                CCtlOp::RS => 7_u8,
+                CCtlOp::WBAll => 8_u8,
+                CCtlOp::RSLB => 9_u8,
+                CCtlOp::IVAllP | CCtlOp::WBAllP => {
+                    panic!("cctl{} is not supported on SM20", self.op);
+                }
+            },
+        );
+        e.set_dst(14..20, Dst::None);
+        e.set_reg_src(20..26, self.addr);
+        e.set_field(26..28, 0); // 1: .u, 2: .c, 3: .i
+
+        // The offset must be a multiple of 4 and is stored divided by 4;
+        // the field is narrower outside of global memory.
+        assert!(self.addr_offset % 4 == 0);
+        if matches!(self.mem_space, MemSpace::Global(_)) {
+            e.set_field(28..58, self.addr_offset / 4);
+        } else {
+            e.set_field(28..50, self.addr_offset / 4);
+        }
+    }
+}
+
+impl SM20Op for OpMemBar {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    /// Encodes `OpMemBar`, placing its scope selector in bits 5..7.
+    fn encode(&self, e: &mut SM20Encoder<'_>) {
+        e.set_opcode(SM20Unit::Mem, 0x38);
+        e.set_field(
+            5..7,
+            match self.scope {
+                MemScope::CTA => 0_u8,
+                MemScope::GPU => 1_u8,
+                MemScope::System => 2_u8,
+            },
+        );
+    }
+}
+
 impl SM20Encoder<'_> {
     fn set_rel_offset(&mut self, range: Range, label: &Label) {
         let ip = u32::try_from(self.ip).unwrap();
@@ -2199,9 +2397,12 @@ macro_rules! as_sm20_op_match {
             Op::Ld(op) => op,
             Op::Ldc(op) => op,
             Op::St(op) => op,
+            Op::Atom(op) => op,
             Op::ALd(op) => op,
             Op::ASt(op) => op,
             Op::Ipa(op) => op,
+            Op::CCtl(op) => op,
+            Op::MemBar(op) => op,
             Op::Bra(op) => op,
             Op::SSy(op) => op,
             Op::Sync(op) => op,