diff --git a/src/nouveau/compiler/nak/sm32.rs b/src/nouveau/compiler/nak/sm32.rs index 6f4034348f4..d097e6c6276 100644 --- a/src/nouveau/compiler/nak/sm32.rs +++ b/src/nouveau/compiler/nak/sm32.rs @@ -2,8 +2,13 @@ // SPDX-License-Identifier: MIT use crate::ir::*; -use crate::legalize::LegalizeBuilder; -use bitview::{BitMutView, BitMutViewable, BitView, BitViewable, SetFieldU64}; +use crate::legalize::{ + src_is_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers, LegalizeBuilder, +}; +use bitview::{ + BitMutView, BitMutViewable, BitView, BitViewable, SetBit, SetField, + SetFieldU64, +}; use std::{collections::HashMap, ops::Range}; @@ -137,6 +142,14 @@ trait SM32Op { fn encode(&self, e: &mut SM32Encoder<'_>); } +fn zero_reg() -> RegRef { + RegRef::new(RegFile::GPR, 255, 1) +} + +fn true_reg() -> RegRef { + RegRef::new(RegFile::Pred, 7, 1) +} + #[allow(dead_code)] struct SM32Encoder<'a> { sm: &'a ShaderModel32, @@ -167,23 +180,2719 @@ impl SetFieldU64 for SM32Encoder<'_> { BitMutView::new(&mut self.inst).set_field_u64(range, val); } } -macro_rules! as_sm32_op_match { + +impl SM32Encoder<'_> { + fn set_opcode(&mut self, opcode: u16, functional_unit: u8) { + self.set_field(52..64, opcode); + + assert!(functional_unit < 3); + self.set_field(0..2, functional_unit); + } + + fn set_pred_reg(&mut self, range: Range, reg: RegRef) { + assert!(range.len() == 3); + assert!(reg.file() == RegFile::Pred); + assert!(reg.base_idx() <= 7); + assert!(reg.comps() == 1); + self.set_field(range, reg.base_idx()); + } + + fn set_pred_src(&mut self, range: Range, src: &Src) { + // The default for predicates is true + let (not, reg) = match src.src_ref { + SrcRef::True => (false, true_reg()), + SrcRef::False => (true, true_reg()), + SrcRef::Reg(reg) => (false, reg), + _ => panic!("Not a register"), + }; + self.set_pred_reg(range.start..(range.end - 1), reg); + self.set_bit(range.end - 1, not ^ src.src_mod.is_bnot()); + } + + fn set_pred_dst(&mut self, range: Range, dst: &Dst) { + let reg = match dst { + Dst::None => true_reg(), + Dst::Reg(reg) => *reg, + _ => panic!("Dst is not pred {dst}"), + }; + self.set_pred_reg(range, reg); + } + + fn set_pred(&mut self, pred: &Pred) { + // predicates are 4 bits starting at 18, last one denotes inversion + assert!(!pred.is_false()); + self.set_pred_reg( + 18..21, + match pred.pred_ref { + PredRef::None => true_reg(), + PredRef::Reg(reg) => reg, + PredRef::SSA(_) => panic!("SSA values must be lowered"), + }, + ); + self.set_bit(21, pred.pred_inv); + } + + fn set_reg(&mut self, range: Range, reg: RegRef) { + assert!(range.len() == 8); + assert!(reg.file() == RegFile::GPR); + self.set_field(range, reg.base_idx()); + } + + fn set_reg_src_ref(&mut self, range: Range, src_ref: &SrcRef) { + let reg = match src_ref { + SrcRef::Zero => zero_reg(), + SrcRef::Reg(reg) => *reg, + _ => panic!("Not a register"), + }; + self.set_reg(range, reg); + } + + fn set_reg_src(&mut self, range: Range, src: &Src) { + assert!(src.src_swizzle.is_none()); + self.set_reg_src_ref(range, &src.src_ref); + } + + fn set_reg_fmod_src( + &mut self, + range: Range, + abs_bit: usize, + neg_bit: usize, + src: &Src, + ) { + self.set_reg_src_ref(range, &src.src_ref); + self.set_bit(abs_bit, src.src_mod.has_fabs()); + self.set_bit(neg_bit, src.src_mod.has_fneg()); + } + + fn set_dst(&mut self, dst: &Dst) { + let reg = match dst { + Dst::None => zero_reg(), + Dst::Reg(reg) => *reg, + _ => panic!("Invalid dst {dst}"), + }; + self.set_reg(2..10, reg); + } + + fn set_src_imm_i20( + &mut self, + range: Range, + sign_bit: usize, 
+ i: u32,
+ ) {
+ assert!(range.len() == 19);
+ assert!((i & 0xfff80000) == 0 || (i & 0xfff80000) == 0xfff80000);
+
+ self.set_field(range, i & 0x7ffff);
+ self.set_field(sign_bit..sign_bit + 1, (i & 0x80000) >> 19);
+ }
+
+ fn set_src_imm_f20(
+ &mut self,
+ range: Range,
+ sign_bit: usize,
+ f: u32,
+ ) {
+ assert!(range.len() == 19);
+ assert!((f & 0x00000fff) == 0);
+
+ self.set_field(range, (f >> 12) & 0x7ffff);
+ self.set_field(sign_bit..sign_bit + 1, f >> 31);
+ }
+
+ fn set_src_cbuf(&mut self, range: Range, cb: &CBufRef) {
+ let mut v = BitMutView::new_subset(self, range);
+
+ assert!(cb.offset % 4 == 0);
+ v.set_field(0..14, cb.offset >> 2);
+
+ let CBuf::Binding(idx) = cb.buf else {
+ panic!("Must be a bound constant buffer");
+ };
+
+ v.set_field(14..19, idx);
+ }
+
+ fn set_rnd_mode(&mut self, range: Range, rnd_mode: FRndMode) {
+ assert!(range.len() == 2);
+ self.set_field(
+ range,
+ match rnd_mode {
+ FRndMode::NearestEven => 0_u8,
+ FRndMode::NegInf => 1_u8,
+ FRndMode::PosInf => 2_u8,
+ FRndMode::Zero => 3_u8,
+ },
+ );
+ }
+
+ fn set_instr_dependency(&mut self, _deps: &InstrDeps) {
+ // TODO: scheduling
+ //let mut sched = BitMutView::new(&mut self.sched);
+ //sched.set_field(0..5, deps.delay);
+ self.sched = 0x00;
+ // 0x00: wait for 32 cycles
+ // 0x04: dual-issue with next instruction
+ // 0xc2 if TEXBAR
+ // 0x40 if EXPORT
+ // 0x20 otherwise(?)
+
+ // 0x80: global memory bit
+ // 0x40: EXPORT(?)
+ // 0x20: suspend for N cycles (N = bitmask 0x1f)
+ // 0x10: shared memory?
+ }
+}
+
+//
+// Small helper for encoding of ALU instructions
+//
+enum AluSrc {
+ Reg(RegRef),
+ Imm(u32),
+ CBuf(CBufRef),
+}
+
+impl AluSrc {
+ fn from_src(src: &Src) -> AluSrc {
+ assert!(src.src_swizzle.is_none());
+ // Don't assert on src_mod; source modifiers are encoded by the opcode.
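+ // SrcRef::Zero maps to rZ (r255), which always reads as zero, so it
+ // can stand in for the register slot of any form.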
+ + match &src.src_ref { + SrcRef::Zero => AluSrc::Reg(zero_reg()), + SrcRef::Reg(r) => AluSrc::Reg(*r), + SrcRef::Imm32(x) => AluSrc::Imm(*x), + SrcRef::CBuf(x) => AluSrc::CBuf(x.clone()), + _ => panic!("Unhandled ALU src type"), + } + } +} + +impl SM32Encoder<'_> { + fn encode_form_immreg( + &mut self, + opcode_imm: u16, + opcode_reg: u16, + dst: Option<&Dst>, + src0: &Src, + src1: &Src, + src2: Option<&Src>, + is_imm_float: bool, + ) { + // There are 4 possible forms: + // rir: 10...01 (only one with immediate) + // rcr: 01...10 (c = constant buf reference) + // rrc: 10...10 + // rrr: 11...10 + // other forms are invalid (or encode other instructions) + enum Form { + RIR, + RCR, + RRC, + RRR, + } + let src1 = AluSrc::from_src(src1); + let src2 = src2.map(|s| AluSrc::from_src(s)); + + if let Some(dst) = dst { + self.set_dst(dst); + } + + // SRC[0] must always be a register + self.set_reg_src(10..18, src0); + + let form = match src1 { + AluSrc::Imm(imm) => { + self.set_opcode(opcode_imm, 1); + + if is_imm_float { + self.set_src_imm_f20(23..42, 59, imm); + } else { + self.set_src_imm_i20(23..42, 59, imm); + } + match src2 { + None => {} + Some(AluSrc::Reg(src2)) => self.set_reg(42..50, src2), + _ => panic!("Invalid form"), + } + Form::RIR + } + AluSrc::CBuf(cb) => { + self.set_opcode(opcode_reg, 2); + self.set_src_cbuf(23..42, &cb); + match src2 { + None => {} + Some(AluSrc::Reg(src2)) => self.set_reg(42..50, src2), + _ => panic!("Invalid form"), + } + Form::RCR + } + AluSrc::Reg(r1) => { + self.set_opcode(opcode_reg, 2); + match src2 { + None => { + self.set_reg(23..31, r1); + Form::RRR + } + Some(AluSrc::Reg(r2)) => { + self.set_reg(23..31, r1); + self.set_reg(42..50, r2); + Form::RRR + } + Some(AluSrc::CBuf(cb)) => { + self.set_src_cbuf(23..42, &cb); + self.set_reg(42..50, r1); + Form::RRC + } + _ => panic!("Invalid form"), + } + } + }; + + // Set form selector + let form_sel: u8 = match form { + Form::RCR => 0b01, + Form::RRC => 0b10, + Form::RRR => 0b11, + Form::RIR => return, // don't set high bits, reserved for opcode + }; + assert!(self.get_bit_range_u64(62..64) == 0); + self.set_field(62..64, form_sel); + } +} + +// +// Implementations of SM32Op for each op we support on KeplerB +// + +impl SM32Op for OpFAdd { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32); + + if src1.as_imm_not_f20().is_some() + && self.rnd_mode != FRndMode::NearestEven + { + // Hardware cannot encode long-immediate + rounding mode + b.copy_alu_src(src1, GPR, SrcType::F32); + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + if let Some(imm32) = self.srcs[1].as_imm_not_f20() { + e.set_opcode(0x400, 0); + e.set_dst(&self.dst); + e.set_reg_fmod_src(10..18, 57, 59, &self.srcs[0]); + e.set_field(23..55, imm32); + + assert!(self.rnd_mode == FRndMode::NearestEven); + + e.set_bit(22, self.saturate); + e.set_bit(56, self.srcs[1].src_mod.has_fneg()); + e.set_bit(58, self.ftz); + e.set_bit(60, self.srcs[1].src_mod.has_fabs()); + } else { + e.encode_form_immreg( + 0xc2c, + 0x22c, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + true, + ); + + e.set_rnd_mode(42..44, self.rnd_mode); + e.set_bit(47, self.ftz); + e.set_bit(48, self.srcs[1].src_mod.has_fneg()); + e.set_bit(49, self.srcs[0].src_mod.has_fabs()); + // 50: .cc? 
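+ // Bits 47..52 are the same ftz/fneg/fabs modifier block that fmnmx
+ // reuses below; fadd additionally has saturate at bit 53.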
+ e.set_bit(51, self.srcs[0].src_mod.has_fneg()); + e.set_bit(52, self.srcs[1].src_mod.has_fabs()); + e.set_bit(53, self.saturate); + } + } +} + +impl SM32Op for OpFFma { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1, src2] = &mut self.srcs; + b.copy_alu_src_if_fabs(src0, GPR, SrcType::F32); + b.copy_alu_src_if_fabs(src1, GPR, SrcType::F32); + b.copy_alu_src_if_fabs(src2, GPR, SrcType::F32); + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32); + b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32); + if src_is_reg(src1, GPR) { + b.copy_alu_src_if_imm(src2, GPR, SrcType::F32); + } else { + b.copy_alu_src_if_not_reg(src2, GPR, SrcType::F32); + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // ffma doesn't have any abs flags. + assert!(!self.srcs[0].src_mod.has_fabs()); + assert!(!self.srcs[1].src_mod.has_fabs()); + assert!(!self.srcs[2].src_mod.has_fabs()); + + // There is one fneg bit shared by the two fmul sources + let fneg_fmul = + self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg(); + + // Technically, ffma also supports a 32-bit immediate, + // but only in the case where the destination is the + // same as src2. We don't support that right now. + e.encode_form_immreg( + 0x940, + 0x0c0, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + Some(&self.srcs[2]), + true, + ); + + e.set_bit(51, fneg_fmul); + e.set_bit(52, self.srcs[2].src_mod.has_fneg()); + e.set_bit(53, self.saturate); + e.set_rnd_mode(54..56, self.rnd_mode); + + e.set_bit(56, self.ftz); + e.set_bit(57, self.dnz); + } +} + +impl SM32Op for OpFMnMx { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32); + b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xc30, + 0x230, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + true, + ); + + e.set_pred_src(42..46, &self.min); + e.set_bit(47, self.ftz); + e.set_bit(48, self.srcs[1].src_mod.has_fneg()); + e.set_bit(49, self.srcs[0].src_mod.has_fabs()); + // 50: .cc? + e.set_bit(51, self.srcs[0].src_mod.has_fneg()); + e.set_bit(52, self.srcs[1].src_mod.has_fabs()); + } +} + +impl SM32Op for OpFMul { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + b.copy_alu_src_if_fabs(src0, GPR, SrcType::F32); + b.copy_alu_src_if_fabs(src1, GPR, SrcType::F32); + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU); + + if src1.as_imm_not_f20().is_some() + && self.rnd_mode != FRndMode::NearestEven + { + // Hardware cannot encode long-immediate + rounding mode + b.copy_alu_src(src1, GPR, SrcType::F32); + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // fmul doesn't have any abs flags. 
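+ // legalize() copies any fabs source through a GPR first, so only
+ // fneg can survive to this point.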
+ assert!(!self.srcs[0].src_mod.has_fabs()); + assert!(!self.srcs[1].src_mod.has_fabs()); + + // Hw doesn't like ftz and dnz together + assert!(!(self.ftz && self.dnz)); + + // There is one fneg bit shared by both sources + let fneg = + self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg(); + + if let Some(mut limm) = self.srcs[1].as_imm_not_f20() { + e.set_opcode(0x200, 2); + e.set_dst(&self.dst); + + e.set_reg_src(10..18, &self.srcs[0]); + if fneg { + // Flip the immediate sign bit + limm ^= 0x80000000; + } + e.set_field(23..55, limm); + + assert!(self.rnd_mode == FRndMode::NearestEven); + e.set_bit(56, self.ftz); + e.set_bit(57, self.dnz); + } else { + e.encode_form_immreg( + 0xc34, + 0x234, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + true, + ); + + e.set_rnd_mode(42..44, self.rnd_mode); + e.set_bit(47, self.ftz); + e.set_bit(48, self.dnz); + e.set_bit(51, fneg); + } + } +} + +impl SM32Op for OpRro { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_imm(&mut self.src, GPR, SrcType::F32); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // also: 0xc48, 1 is the immediate form (not really useful) + + match &self.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_opcode(0xe48, 2); + e.set_reg_src(23..31, &self.src); + } + SrcRef::CBuf(cb) => { + e.set_opcode(0x648, 2); + e.set_src_cbuf(23..42, &cb); + } + _ => panic!("Invalid Rro src"), + } + + e.set_dst(&self.dst); + + e.set_field( + 42..43, + match self.op { + RroOp::SinCos => 0u8, + RroOp::Exp2 => 1u8, + }, + ); + e.set_bit(48, self.src.src_mod.has_fneg()); + e.set_bit(52, self.src.src_mod.has_fabs()); + } +} + +impl SM32Op for OpMuFu { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + b.copy_alu_src_if_not_reg(&mut self.src, RegFile::GPR, SrcType::GPR); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x840, 2); + + e.set_dst(&self.dst); + e.set_reg_fmod_src(10..18, 49, 51, &self.src); + + e.set_field( + 23..27, + match self.op { + MuFuOp::Cos => 0_u8, + MuFuOp::Sin => 1_u8, + MuFuOp::Exp2 => 2_u8, + MuFuOp::Log2 => 3_u8, + MuFuOp::Rcp => 4_u8, + MuFuOp::Rsq => 5_u8, + MuFuOp::Rcp64H => 6_u8, + MuFuOp::Rsq64H => 7_u8, + MuFuOp::Sqrt => panic!("MUFU.SQRT not supported on SM32"), + MuFuOp::Tanh => panic!("MUFU.TANH not supported on SM32"), + }, + ); + } +} + +impl SM32Encoder<'_> { + fn set_float_cmp_op(&mut self, range: Range, op: FloatCmpOp) { + assert!(range.len() == 4); + self.set_field( + range, + match op { + FloatCmpOp::OrdLt => 0x01_u8, + FloatCmpOp::OrdEq => 0x02_u8, + FloatCmpOp::OrdLe => 0x03_u8, + FloatCmpOp::OrdGt => 0x04_u8, + FloatCmpOp::OrdNe => 0x05_u8, + FloatCmpOp::OrdGe => 0x06_u8, + FloatCmpOp::UnordLt => 0x09_u8, + FloatCmpOp::UnordEq => 0x0a_u8, + FloatCmpOp::UnordLe => 0x0b_u8, + FloatCmpOp::UnordGt => 0x0c_u8, + FloatCmpOp::UnordNe => 0x0d_u8, + FloatCmpOp::UnordGe => 0x0e_u8, + FloatCmpOp::IsNum => 0x07_u8, + FloatCmpOp::IsNan => 0x08_u8, + }, + ); + } + + fn set_pred_set_op(&mut self, range: Range, op: PredSetOp) { + assert!(range.len() == 2); + self.set_field( + range, + match op { + PredSetOp::And => 0_u8, + PredSetOp::Or => 1_u8, + PredSetOp::Xor => 2_u8, + }, + ); + } + + fn set_int_cmp_op(&mut self, range: Range, op: IntCmpOp) { + assert!(range.len() == 3); + self.set_field( + range, + match op { + IntCmpOp::False => 0_u8, + IntCmpOp::True => 7_u8, + IntCmpOp::Eq => 2_u8, + IntCmpOp::Ne => 5_u8, + IntCmpOp::Lt => 1_u8, + IntCmpOp::Le => 3_u8, + IntCmpOp::Gt => 4_u8, + IntCmpOp::Ge => 6_u8, + }, + ); 
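+ // In this 3-bit encoding, bit 0 = less, bit 1 = equal and bit 2 =
+ // greater, so the composite tests are just ORs: Le = Lt | Eq,
+ // Ne = Lt | Gt and Ge = Eq | Gt.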
+ }
+}
+
+impl SM32Op for OpFSet {
+ fn legalize(&mut self, b: &mut LegalizeBuilder) {
+ use RegFile::GPR;
+ let [src0, src1] = &mut self.srcs;
+ if swap_srcs_if_not_reg(src0, src1, GPR) {
+ self.cmp_op = self.cmp_op.flip();
+ }
+ b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
+ b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32);
+ }
+
+ fn encode(&self, e: &mut SM32Encoder<'_>) {
+ e.encode_form_immreg(
+ 0x800,
+ 0x000,
+ Some(&self.dst),
+ &self.srcs[0],
+ &self.srcs[1],
+ None,
+ true,
+ );
+
+ e.set_pred_src(42..46, &SrcRef::True.into());
+ e.set_bit(46, self.srcs[0].src_mod.has_fneg());
+ e.set_bit(47, self.srcs[1].src_mod.has_fabs());
+
+ // 48..50: and, or, xor?
+ e.set_float_cmp_op(51..55, self.cmp_op);
+
+ e.set_bit(56, self.srcs[1].src_mod.has_fneg());
+ e.set_bit(57, self.srcs[0].src_mod.has_fabs());
+
+ e.set_bit(58, self.ftz);
+ }
+}
+
+impl SM32Op for OpFSetP {
+ fn legalize(&mut self, b: &mut LegalizeBuilder) {
+ use RegFile::GPR;
+ let [src0, src1] = &mut self.srcs;
+ if swap_srcs_if_not_reg(src0, src1, GPR) {
+ self.cmp_op = self.cmp_op.flip();
+ }
+ b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F32);
+ b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F32);
+ }
+
+ fn encode(&self, e: &mut SM32Encoder<'_>) {
+ e.encode_form_immreg(
+ 0xb58,
+ 0x1d8,
+ None,
+ &self.srcs[0],
+ &self.srcs[1],
+ None,
+ true,
+ );
+ e.set_pred_dst(2..5, &Dst::None); // dst1
+ e.set_pred_dst(5..8, &self.dst); // dst0
+ e.set_pred_src(42..46, &self.accum);
+
+ e.set_bit(8, self.srcs[1].src_mod.has_fneg());
+ e.set_bit(9, self.srcs[0].src_mod.has_fabs());
+ e.set_bit(46, self.srcs[0].src_mod.has_fneg());
+ e.set_bit(47, self.srcs[1].src_mod.has_fabs());
+
+ e.set_pred_set_op(48..50, self.set_op);
+ e.set_bit(50, self.ftz);
+ e.set_float_cmp_op(51..55, self.cmp_op);
+ }
+}
+
+impl SM32Op for OpFSwz {
+ fn legalize(&mut self, b: &mut LegalizeBuilder) {
+ use RegFile::GPR;
+ b.copy_alu_src_if_not_reg(&mut self.srcs[0], GPR, SrcType::GPR);
+ b.copy_alu_src_if_not_reg(&mut self.srcs[1], GPR, SrcType::GPR);
+ }
+
+ fn encode(&self, e: &mut SM32Encoder<'_>) {
+ e.set_opcode(0x7fc, 2);
+
+ e.set_dst(&self.dst);
+ e.set_reg_src(10..18, &self.srcs[0]);
+ e.set_reg_src(23..31, &self.srcs[1]);
+
+ e.set_field(
+ 42..44,
+ match self.rnd_mode {
+ FRndMode::NearestEven => 0u8,
+ FRndMode::NegInf => 1u8,
+ FRndMode::PosInf => 2u8,
+ FRndMode::Zero => 3u8,
+ },
+ );
+
+ for (i, op) in self.ops.iter().enumerate() {
+ e.set_field(
+ 31 + i * 2..31 + (i + 1) * 2,
+ match op {
+ FSwzAddOp::Add => 0u8,
+ FSwzAddOp::SubLeft => 1u8,
+ FSwzAddOp::SubRight => 2u8,
+ FSwzAddOp::MoveLeft => 3u8,
+ },
+ );
+ }
+
+ // Shuffle mode
+ e.set_field(
+ 44..47,
+ match self.shuffle {
+ FSwzShuffle::Quad0 => 0_u8,
+ FSwzShuffle::Quad1 => 1_u8,
+ FSwzShuffle::Quad2 => 2_u8,
+ FSwzShuffle::Quad3 => 3_u8,
+ FSwzShuffle::SwapHorizontal => 4_u8,
+ FSwzShuffle::SwapVertical => 5_u8,
+ },
+ );
+
+ e.set_bit(41, false); // .NDV
+ e.set_bit(47, self.ftz); // .FTZ
+ e.set_bit(50, false); // .CC
+ }
+}
+
+impl SM32Op for OpDAdd {
+ fn legalize(&mut self, b: &mut LegalizeBuilder) {
+ use RegFile::GPR;
+ let [src0, src1] = &mut self.srcs;
+ swap_srcs_if_not_reg(src0, src1, GPR);
+ b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64);
+ b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64);
+ }
+
+ fn encode(&self, e: &mut SM32Encoder<'_>) {
+ e.encode_form_immreg(
+ 0xc38,
+ 0x238,
+ Some(&self.dst),
+ &self.srcs[0],
+ &self.srcs[1],
+ None,
+ true,
+ );
+
+ e.set_rnd_mode(42..44, self.rnd_mode);
+ // 47: .ftz
+ e.set_bit(48, 
self.srcs[1].src_mod.has_fneg()); + e.set_bit(49, self.srcs[0].src_mod.has_fabs()); + // 50: .cc? + e.set_bit(51, self.srcs[0].src_mod.has_fneg()); + e.set_bit(52, self.srcs[1].src_mod.has_fabs()); + // 53: .sat + } +} + +impl SM32Op for OpDFma { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1, src2] = &mut self.srcs; + b.copy_alu_src_if_fabs(src0, GPR, SrcType::F64); + b.copy_alu_src_if_fabs(src1, GPR, SrcType::F64); + b.copy_alu_src_if_fabs(src2, GPR, SrcType::F64); + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64); + b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64); + if src_is_reg(src1, GPR) { + b.copy_alu_src_if_imm(src2, GPR, SrcType::F64); + } else { + b.copy_alu_src_if_not_reg(src2, GPR, SrcType::F64); + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // ffma doesn't have any abs flags. + assert!(!self.srcs[0].src_mod.has_fabs()); + assert!(!self.srcs[1].src_mod.has_fabs()); + assert!(!self.srcs[2].src_mod.has_fabs()); + + // There is one fneg bit shared by the two fmul sources + let fneg_fmul = + self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg(); + + e.encode_form_immreg( + 0xb38, + 0x1b8, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + Some(&self.srcs[2]), + true, + ); + + e.set_bit(51, fneg_fmul); + e.set_bit(52, self.srcs[2].src_mod.has_fneg()); + e.set_rnd_mode(53..55, self.rnd_mode); + } +} + +impl SM32Op for OpDMnMx { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64); + b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xc28, + 0x228, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + true, + ); + + e.set_pred_src(42..46, &self.min); + e.set_bit(48, self.srcs[1].src_mod.has_fneg()); + e.set_bit(49, self.srcs[0].src_mod.has_fabs()); + // 50: .cc? + e.set_bit(51, self.srcs[0].src_mod.has_fneg()); + e.set_bit(52, self.srcs[1].src_mod.has_fabs()); + } +} + +impl SM32Op for OpDMul { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + b.copy_alu_src_if_fabs(src0, GPR, SrcType::F64); + b.copy_alu_src_if_fabs(src1, GPR, SrcType::F64); + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64); + b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // fmul doesn't have any abs flags. 
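+ // (dmul, like fmul: legalize() already lowered fabs via copies, so
+ // only the shared fneg bit needs encoding.)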
+ assert!(!self.srcs[0].src_mod.has_fabs()); + assert!(!self.srcs[1].src_mod.has_fabs()); + + // There is one fneg bit shared by both sources + let fneg = + self.srcs[0].src_mod.has_fneg() ^ self.srcs[1].src_mod.has_fneg(); + + e.encode_form_immreg( + 0xc40, + 0x240, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + true, + ); + + e.set_rnd_mode(42..44, self.rnd_mode); + e.set_bit(51, fneg); + } +} + +impl SM32Op for OpDSetP { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + if swap_srcs_if_not_reg(src0, src1, GPR) { + self.cmp_op = self.cmp_op.flip(); + } + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::F64); + b.copy_alu_src_if_f20_overflow(src1, GPR, SrcType::F64); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xb40, + 0x1c0, + None, + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + e.set_pred_dst(2..5, &Dst::None); // dst1 + e.set_pred_dst(5..8, &self.dst); // dst0 + e.set_pred_src(42..46, &self.accum); + + e.set_bit(8, self.srcs[1].src_mod.has_fneg()); + e.set_bit(9, self.srcs[0].src_mod.has_fabs()); + e.set_bit(46, self.srcs[0].src_mod.has_fneg()); + e.set_bit(47, self.srcs[1].src_mod.has_fabs()); + + e.set_pred_set_op(48..50, self.set_op); + // 50: ftz + e.set_float_cmp_op(51..55, self.cmp_op); + } +} + +impl SM32Op for OpBfe { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.base, GPR, SrcType::ALU); + if let SrcRef::Imm32(imm) = &mut self.range.src_ref { + *imm = *imm & 0xffff; // Only the lower 2 bytes matter + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xc00, + 0x200, + Some(&self.dst), + &self.base, + &self.range, + None, + false, + ); + + e.set_bit(43, self.reverse); + e.set_bit(51, self.signed); + } +} + +impl SM32Op for OpFlo { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_imm(&mut self.src, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + match &self.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_opcode(0xe18, 2); + e.set_reg_src(23..31, &self.src); + } + SrcRef::CBuf(cb) => { + e.set_opcode(0x618, 2); + e.set_src_cbuf(23..42, &cb); + } + _ => panic!("Invalid flo src"), + } + + e.set_bit(43, self.src.src_mod.is_bnot()); + e.set_bit(44, self.return_shift_amount); + e.set_bit(51, self.signed); + + e.set_dst(&self.dst); + } +} + +impl SM32Op for OpIAdd2 { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + swap_srcs_if_not_reg(src0, src1, GPR); + if src0.src_mod.is_ineg() && src1.src_mod.is_ineg() { + assert!(self.carry_out.is_none()); + b.copy_alu_src_and_lower_ineg(src0, GPR, SrcType::I32); + } + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::I32); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // Hardware requires at least one of these be unmodified. Otherwise, it + // encodes as iadd.po which isn't what we want. 
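+ // (.po is the "plus one" form: with both negate bits set the
+ // hardware computes a + b + 1 rather than -a - b.)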
+ assert!( + self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none() + ); + + let carry_out = match &self.carry_out { + Dst::Reg(reg) if reg.file() == RegFile::Carry => true, + Dst::None => false, + dst => panic!("Invalid iadd carry_out: {dst}"), + }; + + if let Some(limm) = self.srcs[1].as_imm_not_i20() { + e.set_opcode(0x400, 1); + e.set_dst(&self.dst); + + e.set_reg_src(10..18, &self.srcs[0]); + e.set_field(23..55, limm); + + e.set_bit(59, self.srcs[0].src_mod.is_ineg()); + e.set_bit(55, carry_out); // .cc + e.set_bit(56, false); // .X + } else { + e.encode_form_immreg( + 0xc08, + 0x208, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + + e.set_bit(52, self.srcs[0].src_mod.is_ineg()); + e.set_bit(51, self.srcs[1].src_mod.is_ineg()); + e.set_bit(50, carry_out); + e.set_bit(46, false); // .x + } + } +} + +impl SM32Op for OpIAdd2X { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::I32); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + match &self.carry_in.src_ref { + SrcRef::Reg(reg) if reg.file() == RegFile::Carry => (), + src => panic!("Invalid iadd.x carry_in: {src}"), + } + + let carry_out = match &self.carry_out { + Dst::Reg(reg) if reg.file() == RegFile::Carry => true, + Dst::None => false, + dst => panic!("Invalid iadd.x carry_out: {dst}"), + }; + + if let Some(limm) = self.srcs[1].as_imm_not_i20() { + e.set_opcode(0x400, 1); + e.set_dst(&self.dst); + + e.set_reg_src(10..18, &self.srcs[0]); + e.set_field(23..55, limm); + + e.set_bit(59, self.srcs[0].src_mod.is_bnot()); + e.set_bit(55, carry_out); // .cc + e.set_bit(56, true); // .X + } else { + e.encode_form_immreg( + 0xc08, + 0x208, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + + e.set_bit(52, self.srcs[0].src_mod.is_bnot()); + e.set_bit(51, self.srcs[1].src_mod.is_bnot()); + e.set_bit(50, carry_out); + e.set_bit(46, true); // .x + } + } +} + +impl SM32Op for OpIMad { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1, src2] = &mut self.srcs; + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU); + b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU); + if src_is_reg(src1, GPR) { + b.copy_alu_src_if_imm(src2, GPR, SrcType::ALU); + } else { + b.copy_alu_src_if_not_reg(src2, GPR, SrcType::ALU); + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // TODO: long immediate, ex: 0xa1081800019c0801 (not handled by codegen) + e.encode_form_immreg( + 0xa00, + 0x110, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + Some(&self.srcs[2]), + false, + ); + // 57: .hi + e.set_bit(56, self.signed); + e.set_bit( + 55, + self.srcs[0].src_mod.is_ineg() ^ self.srcs[1].src_mod.is_ineg(), + ); + e.set_bit(54, self.srcs[2].src_mod.is_ineg()); + // 53: .sat + // 52: .x + e.set_bit(51, self.signed); + // 50: cc + // 22: .s + } +} + +impl SM32Op for OpIMul { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + if swap_srcs_if_not_reg(src0, src1, GPR) { + self.signed.swap(0, 1); + } + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + assert!(self.srcs[0].src_mod.is_none()); + assert!(self.srcs[1].src_mod.is_none()); + + if let Some(limm) = self.srcs[1].as_imm_not_i20() { + e.set_opcode(0x280, 2); + e.set_dst(&self.dst); + + 
e.set_reg_src(10..18, &self.srcs[0]); + e.set_field(23..55, limm); + + e.set_bit(58, self.signed[1]); + e.set_bit(57, self.signed[0]); + e.set_bit(56, self.high); + } else { + e.encode_form_immreg( + 0xc1c, + 0x21c, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + + e.set_bit(44, self.signed[1]); + e.set_bit(43, self.signed[0]); + e.set_bit(42, self.high); + } + } +} + +impl SM32Op for OpIMnMx { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU); + b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xc10, + 0x210, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + + e.set_pred_src(42..46, &self.min); + // 46..48: ?|xlo|xmed|xhi + e.set_bit( + 51, + match self.cmp_type { + IntCmpType::U32 => false, + IntCmpType::I32 => true, + }, + ); + } +} + +impl SM32Op for OpISetP { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + if swap_srcs_if_not_reg(src0, src1, GPR) { + self.cmp_op = self.cmp_op.flip(); + } + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU); + b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xb30, + 0x1b0, + None, + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + e.set_pred_dst(2..5, &Dst::None); // dst1 + e.set_pred_dst(5..8, &self.dst); // dst0 + e.set_pred_src(42..46, &self.accum); + + e.set_bit(46, self.ex); + e.set_pred_set_op(48..50, self.set_op); + + e.set_field( + 51..52, + match self.cmp_type { + IntCmpType::U32 => 0_u8, + IntCmpType::I32 => 1_u8, + }, + ); + e.set_int_cmp_op(52..55, self.cmp_op); + } +} + +impl SM32Op for OpLop2 { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + match self.op { + LogicOp2::And | LogicOp2::Or | LogicOp2::Xor => { + swap_srcs_if_not_reg(src0, src1, GPR); + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU); + } + LogicOp2::PassB => { + *src0 = 0.into(); + b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU); + } + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + if let Some(limm) = self.srcs[1].as_imm_not_i20() { + e.set_opcode(0x200, 0); + + e.set_dst(&self.dst); + e.set_reg_src(10..18, &self.srcs[0]); + e.set_field(23..55, limm); + e.set_field( + 56..58, + match self.op { + LogicOp2::And => 0_u8, + LogicOp2::Or => 1_u8, + LogicOp2::Xor => 2_u8, + LogicOp2::PassB => panic!("Not supported for imm32"), + }, + ); + e.set_bit(58, self.srcs[0].src_mod.is_bnot()); + e.set_bit(59, self.srcs[1].src_mod.is_bnot()); + } else { + e.encode_form_immreg( + 0xc20, + 0x220, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + + e.set_bit(42, self.srcs[0].src_mod.is_bnot()); + e.set_bit(43, self.srcs[1].src_mod.is_bnot()); + + e.set_field( + 44..46, + match self.op { + LogicOp2::And => 0_u8, + LogicOp2::Or => 1_u8, + LogicOp2::Xor => 2_u8, + LogicOp2::PassB => 3_u8, + }, + ); + } + } +} + +impl SM32Op for OpPopC { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.src, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // popc on Kepler takes two sources and ANDs them and counts the + // intersecting bits. Pass it !rZ as the second source. 
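+ // !rZ reads as ~0 (all ones), so the AND leaves the real source
+ // unchanged and popc counts all of its bits.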
+ let mask = Src::from(0).bnot(); + e.encode_form_immreg( + 0xc04, + 0x204, + Some(&self.dst), + &mask, + &self.src, + None, + false, + ); + e.set_bit(42, mask.src_mod.is_bnot()); + e.set_bit(43, self.src.src_mod.is_bnot()); + } +} + +impl SM32Op for OpShf { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.high, GPR, SrcType::ALU); + b.copy_alu_src_if_not_reg(&mut self.low, GPR, SrcType::GPR); + b.copy_alu_src_if_not_reg_or_imm(&mut self.shift, GPR, SrcType::GPR); + b.copy_alu_src_if_i20_overflow(&mut self.shift, GPR, SrcType::GPR); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + if self.right { + e.encode_form_immreg( + 0xc7c, + 0x27c, + Some(&self.dst), + &self.low, + &self.shift, + Some(&self.high), + false, + ); + } else { + e.encode_form_immreg( + 0xb7c, + 0x1fc, + Some(&self.dst), + &self.low, + &self.shift, + Some(&self.high), + false, + ); + } + + e.set_bit(53, self.wrap); + e.set_bit(52, false); // .x + + // Fun behavior: As for Maxwell, Kepler does not support shf.l.hi + // but it still always takes the high part of the result. + // If we encode .hi it traps with illegal instruction encoding. + // We can encode a low shf.l by using only the high part and + // hard-wiring the low part to rZ. + assert!(self.right || self.dst_high); + e.set_bit(51, self.right && self.dst_high); // .hi + e.set_bit(50, false); // .cc + + e.set_field( + 40..42, + match self.data_type { + IntType::I32 => 0_u8, + IntType::U32 => 0_u8, + IntType::U64 => 2_u8, + IntType::I64 => 3_u8, + _ => panic!("Invalid shift data type"), + }, + ); + e.set_bit(22, false); // .s + } +} + +impl SM32Op for OpShl { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.src, GPR, SrcType::GPR); + if let SrcRef::Imm32(imm32) = &mut self.shift.src_ref { + if self.wrap { + *imm32 = *imm32 & 0x1f; + } else { + *imm32 = (*imm32).min(32); + } + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xc24, + 0x224, + Some(&self.dst), + &self.src, + &self.shift, + None, + false, + ); + e.set_bit(42, self.wrap); + // 46: .x(?) + // 50: .cc(?) + } +} + +impl SM32Op for OpShr { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.src, GPR, SrcType::GPR); + if let SrcRef::Imm32(imm32) = &mut self.shift.src_ref { + if self.wrap { + *imm32 = *imm32 & 0x1f; + } else { + *imm32 = (*imm32).min(32); + } + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xc14, + 0x214, + Some(&self.dst), + &self.src, + &self.shift, + None, + false, + ); + e.set_bit(42, self.wrap); + // 43: .brev + // 47: .x(?) + // 50: .cc(?) + e.set_bit(51, self.signed); + } +} + +impl SM32Op for OpF2F { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let src = &mut self.src; + // No immediates supported + b.copy_alu_src_if_imm(src, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // integer_rnd on SM32 is inferred automatically when + // the src_type and dst_type are the same. 
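+ // e.g. f2f.f32.f32 with a round mode behaves like trunc/floor/ceil;
+ // for actual conversions, integer_rnd must stay unset.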
+ assert!(!self.integer_rnd || (self.src_type == self.dst_type)); + + e.set_dst(&self.dst); + + match &self.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_opcode(0xe54, 2); + e.set_reg_src(23..31, &self.src); + } + SrcRef::CBuf(cb) => { + e.set_opcode(0x654, 2); + e.set_src_cbuf(23..42, cb); + } + src => panic!("Invalid f2f src: {src}"), + } + + // We can't span 32 bits + assert!( + (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32) + || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32) + ); + e.set_field(10..12, (self.dst_type.bits() / 8).ilog2()); + e.set_field(12..14, (self.src_type.bits() / 8).ilog2()); + + e.set_rnd_mode(42..44, self.rnd_mode); + e.set_bit(44, self.high); + e.set_bit(45, self.integer_rnd); + e.set_bit(47, self.ftz); + e.set_bit(48, self.src.src_mod.has_fneg()); + e.set_bit(50, false); // dst.CC + e.set_bit(52, self.src.src_mod.has_fabs()); + e.set_bit(53, false); // saturate + } +} + +impl SM32Op for OpF2I { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let src = &mut self.src; + // No immediates supported + b.copy_alu_src_if_imm(src, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_dst(&self.dst); + + match &self.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_opcode(0xe58, 2); + e.set_reg_src(23..31, &self.src); + } + SrcRef::CBuf(cb) => { + e.set_opcode(0x658, 2); + e.set_src_cbuf(23..42, cb); + } + src => panic!("Invalid f2i src: {src}"), + } + + // We can't span 32 bits + assert!( + (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32) + || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32) + ); + e.set_field(10..12, (self.dst_type.bits() / 8).ilog2()); + e.set_field(12..14, (self.src_type.bits() / 8).ilog2()); + e.set_bit(14, self.dst_type.is_signed()); + + e.set_rnd_mode(42..44, self.rnd_mode); + // 44: .h1 + e.set_bit(47, self.ftz); + e.set_bit(48, self.src.src_mod.has_fneg()); + e.set_bit(50, false); // dst.CC + e.set_bit(52, self.src.src_mod.has_fabs()); + e.set_bit(53, false); // saturate + } +} + +impl SM32Op for OpI2F { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let src = &mut self.src; + // No immediates supported + b.copy_alu_src_if_imm(src, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_dst(&self.dst); + + match &self.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_opcode(0xe5c, 2); + e.set_reg_src(23..31, &self.src); + } + SrcRef::CBuf(cb) => { + e.set_opcode(0x65c, 2); + e.set_src_cbuf(23..42, cb); + } + src => panic!("Invalid i2f src: {src}"), + } + + // We can't span 32 bits + assert!( + (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32) + || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32) + ); + e.set_field(10..12, (self.dst_type.bits() / 8).ilog2()); + e.set_field(12..14, (self.src_type.bits() / 8).ilog2()); + e.set_bit(15, self.src_type.is_signed()); + + e.set_rnd_mode(42..44, self.rnd_mode); + e.set_field(44..46, 0); // .b0-3 + e.set_bit(48, self.src.src_mod.is_ineg()); + e.set_bit(50, false); // dst.CC + e.set_bit(52, false); // iabs + } +} + +impl SM32Op for OpI2I { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let src = &mut self.src; + // No immediates supported + b.copy_alu_src_if_imm(src, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_dst(&self.dst); + + match &self.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_opcode(0xe60, 2); + e.set_reg_src(23..31, &self.src); 
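+ // (same reg/cbuf form split as f2f and f2i above)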
+ } + SrcRef::CBuf(cb) => { + e.set_opcode(0x660, 2); + e.set_src_cbuf(23..42, cb); + } + src => panic!("Invalid i2i src: {src}"), + } + + assert!( + (self.dst_type.bits() <= 32 && self.src_type.bits() <= 32) + || (self.dst_type.bits() >= 32 && self.src_type.bits() >= 32) + ); + e.set_field(10..12, (self.dst_type.bits() / 8).ilog2()); + e.set_field(12..14, (self.src_type.bits() / 8).ilog2()); + e.set_bit(14, self.dst_type.is_signed()); + e.set_bit(15, self.src_type.is_signed()); + + e.set_field(44..46, 0u8); // src.B1-3 + e.set_bit(48, self.neg); + e.set_bit(50, false); // dst.CC + e.set_bit(52, self.abs); + e.set_bit(53, self.saturate); + } +} + +impl SM32Op for OpMov { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + match &self.src.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_opcode(0xe4c, 2); + e.set_reg_src(23..31, &self.src); + e.set_field(42..46, self.quad_lanes); + } + SrcRef::Imm32(limm) => { + e.set_opcode(0x747, 2); + e.set_field(23..55, *limm); + + e.set_field(14..18, self.quad_lanes); + } + SrcRef::CBuf(cb) => { + e.set_opcode(0x64c, 2); + e.set_src_cbuf(23..42, cb); + e.set_field(42..46, self.quad_lanes); + } + src => panic!("Invalid mov src: {src}"), + } + + e.set_dst(&self.dst); + } +} + +impl SM32Op for OpPrmt { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.srcs[0], GPR, SrcType::GPR); + b.copy_alu_src_if_not_reg(&mut self.srcs[1], GPR, SrcType::GPR); + + if let SrcRef::Imm32(imm32) = &mut self.sel.src_ref { + // Only the bottom 16 bits matter anyway + *imm32 = *imm32 & 0xffff; + } + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xb60, + 0x1e0, + Some(&self.dst), + &self.srcs[0], + &self.sel, + Some(&self.srcs[1]), + false, + ); + + e.set_field( + 51..54, + match self.mode { + PrmtMode::Index => 0_u8, + PrmtMode::Forward4Extract => 1_u8, + PrmtMode::Backward4Extract => 2_u8, + PrmtMode::Replicate8 => 3_u8, + PrmtMode::EdgeClampLeft => 4_u8, + PrmtMode::EdgeClampRight => 5_u8, + PrmtMode::Replicate16 => 6_u8, + }, + ); + } +} + +impl SM32Op for OpSel { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + let [src0, src1] = &mut self.srcs; + if swap_srcs_if_not_reg(src0, src1, GPR) { + self.cond = self.cond.clone().bnot(); + } + b.copy_alu_src_if_not_reg(src0, GPR, SrcType::ALU); + b.copy_alu_src_if_i20_overflow(src1, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.encode_form_immreg( + 0xc50, + 0x250, + Some(&self.dst), + &self.srcs[0], + &self.srcs[1], + None, + false, + ); + + e.set_pred_src(42..46, &self.cond); + } +} + +impl SM32Op for OpShfl { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.src, GPR, SrcType::GPR); + b.copy_alu_src_if_not_reg_or_imm(&mut self.lane, GPR, SrcType::ALU); + b.copy_alu_src_if_not_reg_or_imm(&mut self.c, GPR, SrcType::ALU); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x788, 2); + + e.set_dst(&self.dst); + e.set_pred_dst(51..54, &self.in_bounds); + e.set_reg_src(10..18, &self.src); + + e.set_field( + 33..35, + match self.op { + ShflOp::Idx => 0u8, + ShflOp::Up => 1u8, + ShflOp::Down => 2u8, + ShflOp::Bfly => 3u8, + }, + ); + + match &self.lane.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_reg_src(23..31, &self.lane); + e.set_bit(31, false); + } + SrcRef::Imm32(imm32) => { + e.set_field(23..28, *imm32 & 0x1f); + e.set_bit(31, 
true); + } + src => panic!("Invalid shfl lane: {src}"), + } + match &self.c.src_ref { + SrcRef::Zero | SrcRef::Reg(_) => { + e.set_bit(32, false); + e.set_reg_src(42..50, &self.c); + } + SrcRef::Imm32(imm32) => { + e.set_bit(32, true); + e.set_field(37..50, *imm32 & 0x1f1f); + } + src => panic!("Invalid shfl c: {src}"), + } + } +} + +impl SM32Op for OpPSetP { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x848, 2); + + e.set_pred_dst(5..8, &self.dsts[0]); + e.set_pred_dst(2..5, &self.dsts[1]); + + e.set_pred_src(14..18, &self.srcs[0]); + e.set_pred_src(32..36, &self.srcs[1]); + e.set_pred_src(42..46, &self.srcs[2]); + + e.set_pred_set_op(27..29, self.ops[0]); + e.set_pred_set_op(48..50, self.ops[1]); + // 22: .s + } +} + +impl SM32Encoder<'_> { + fn set_tex_dim(&mut self, range: Range, dim: TexDim) { + assert!(range.len() == 3); + self.set_field( + range, + match dim { + TexDim::_1D => 0_u8, + TexDim::Array1D => 1_u8, + TexDim::_2D => 2_u8, + TexDim::Array2D => 3_u8, + TexDim::_3D => 4_u8, + // 5: array_3d + TexDim::Cube => 6_u8, + TexDim::ArrayCube => 7_u8, + }, + ); + } + + fn set_tex_lod_mode(&mut self, range: Range, lod_mode: TexLodMode) { + assert!(range.len() == 3); + self.set_field( + range, + match lod_mode { + TexLodMode::Auto => 0_u8, + TexLodMode::Zero => 1_u8, + TexLodMode::Bias => 2_u8, + TexLodMode::Lod => 3_u8, + // 6: lba + // 7: lla + _ => panic!("Unknown LOD mode"), + }, + ); + } +} + +/// Helper to legalize texture instructions +fn legalize_tex_instr(op: &mut impl SrcsAsSlice, _b: &mut LegalizeBuilder) { + // Texture instructions have one or two sources. When they have two, the + // second one is optional and we can set rZ instead. + let srcs = op.srcs_as_mut_slice(); + assert!(matches!(&srcs[0].src_ref, SrcRef::SSA(_))); + if srcs.len() > 1 { + debug_assert!(srcs.len() == 2); + assert!(matches!(&srcs[1].src_ref, SrcRef::SSA(_) | SrcRef::Zero)); + } +} + +/// Helper to legalize extended or external instructions +/// +/// These are instructions which reach out external units such as load/store +/// and texture ops. They typically can't take anything but GPRs and are the +/// only types of instructions that support vectors. 
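+/// GPR-typed sources are asserted rather than copied here; ALU-typed
+/// sources must already have been legalized by each op's own legalize().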
+/// +fn legalize_ext_instr(op: &mut impl SrcsAsSlice, _b: &mut LegalizeBuilder) { + let src_types = op.src_types(); + for (i, src) in op.srcs_as_mut_slice().iter_mut().enumerate() { + match src_types[i] { + SrcType::SSA => { + assert!(src.as_ssa().is_some()); + } + SrcType::GPR => { + assert!(src_is_reg(src, RegFile::GPR)); + } + SrcType::ALU + | SrcType::F16 + | SrcType::F16v2 + | SrcType::F32 + | SrcType::F64 + | SrcType::I32 + | SrcType::B32 => { + panic!("ALU srcs must be legalized explicitly"); + } + SrcType::Pred => { + panic!("Predicates must be legalized explicitly"); + } + SrcType::Carry => { + panic!("Carry values must be legalized explicitly"); + } + SrcType::Bar => panic!("Barrier regs are Volta+"), + } + } +} + +impl SM32Encoder<'_> { + fn set_mem_type(&mut self, range: Range, mem_type: MemType) { + assert!(range.len() == 3); + self.set_field( + range, + match mem_type { + MemType::U8 => 0_u8, + MemType::I8 => 1_u8, + MemType::U16 => 2_u8, + MemType::I16 => 3_u8, + MemType::B32 => 4_u8, + MemType::B64 => 5_u8, + MemType::B128 => 6_u8, + }, + ); + } + + fn set_mem_access(&mut self, range: Range, access: &MemAccess) { + self.set_field( + range.start..range.start + 1, + match access.space.addr_type() { + MemAddrType::A32 => 0_u8, + MemAddrType::A64 => 1_u8, + }, + ); + self.set_mem_type(range.start + 1..range.end, access.mem_type); + // TODO: order and scope aren't present before SM70, what should we do? + } +} + +impl SM32Op for OpLd { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // Missing: + // 0x774 for load locked + // 0x7c8 for indirect const load + match self.access.space { + MemSpace::Global(_) => { + e.set_opcode(0xc00, 0); + + e.set_field(23..55, self.offset); + e.set_mem_access(55..59, &self.access); + // 59..61 cache hints (.ca, .cg, .cs, .cv) + } + MemSpace::Local | MemSpace::Shared => { + let opc = match self.access.space { + MemSpace::Local => 0x7a0, + MemSpace::Shared => 0x7a4, + _ => unreachable!(), + }; + e.set_opcode(opc, 2); + + // 22..23 .s? 
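+ // Local/shared only get a 24-bit offset (23..47), unlike the
+ // 32-bit offset (23..55) of the global form above.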
+ e.set_field(23..47, self.offset); + // 47..49: cache hints (.ca, .cg, .lu, .cv) + e.set_mem_access(50..54, &self.access); + } + } + e.set_dst(&self.dst); + e.set_reg_src(10..18, &self.addr); + } +} + +impl SM32Op for OpLdc { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + use RegFile::GPR; + b.copy_alu_src_if_not_reg(&mut self.offset, GPR, SrcType::GPR); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + assert!(self.cb.src_mod.is_none()); + let SrcRef::CBuf(cb) = &self.cb.src_ref else { + panic!("Not a CBuf source"); + }; + let CBuf::Binding(cb_idx) = cb.buf else { + panic!("Must be a bound constant buffer"); + }; + + e.set_opcode(0x7c8, 2); + + e.set_dst(&self.dst); + e.set_reg_src(10..18, &self.offset); + e.set_field(23..39, cb.offset); + e.set_field(39..44, cb_idx); + e.set_field( + 47..49, + match self.mode { + LdcMode::Indexed => 0_u8, + LdcMode::IndexedLinear => 1_u8, + LdcMode::IndexedSegmented => 2_u8, + LdcMode::IndexedSegmentedLinear => 3_u8, + }, + ); + e.set_mem_type(51..54, self.mem_type); + } +} + +impl SM32Op for OpSt { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // Missing: + // 0x784 for store locked + match self.access.space { + MemSpace::Global(_) => { + e.set_opcode(0xe00, 0); + + e.set_field(23..55, self.offset); + e.set_mem_access(55..59, &self.access); + // 59..61 cache hints (.ca, .cg, .cs, .cv) + } + MemSpace::Local | MemSpace::Shared => { + let opc = match self.access.space { + MemSpace::Local => 0x7a8, + MemSpace::Shared => 0x7ac, + _ => unreachable!(), + }; + e.set_opcode(opc, 2); + + // 22..23 .s? + e.set_field(23..47, self.offset); + // 47..49: cache hints (.ca, .cg, .lu, .cv) + e.set_mem_access(50..54, &self.access); + } + } + e.set_reg_src(2..10, &self.data); + e.set_reg_src(10..18, &self.addr); + } +} + +fn atom_src_as_ssa( + b: &mut LegalizeBuilder, + src: &Src, + atom_type: AtomType, +) -> SSARef { + if let Some(ssa) = src.as_ssa() { + return ssa.clone(); + } + + if atom_type.bits() == 32 { + let tmp = b.alloc_ssa(RegFile::GPR); + b.copy_to(tmp.into(), 0.into()); + tmp.into() + } else { + debug_assert!(atom_type.bits() == 64); + let tmp = b.alloc_ssa_vec(RegFile::GPR, 2); + b.copy_to(tmp[0].into(), 0.into()); + b.copy_to(tmp[1].into(), 0.into()); + tmp + } +} + +impl SM32Encoder<'_> { + fn set_atom_op(&mut self, range: Range, atom_op: AtomOp) { + self.set_field( + range, + match atom_op { + AtomOp::Add => 0_u8, + AtomOp::Min => 1_u8, + AtomOp::Max => 2_u8, + AtomOp::Inc => 3_u8, + AtomOp::Dec => 4_u8, + AtomOp::And => 5_u8, + AtomOp::Or => 6_u8, + AtomOp::Xor => 7_u8, + AtomOp::Exch => 8_u8, + AtomOp::CmpExch(_) => panic!("CmpExch is a separate opcode"), + // TODO: SafeAdd => 0xa_u8 ? 
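+ // CmpExch is not in this table; it gets the separate 0x778
+ // opcode in OpAtom::encode() below.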
+ }, + ); + } +} + +impl SM32Op for OpAtom { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + if self.atom_op == AtomOp::CmpExch(AtomCmpSrc::Separate) { + let cmpr = atom_src_as_ssa(b, &self.cmpr, self.atom_type); + let data = atom_src_as_ssa(b, &self.data, self.atom_type); + + let mut cmpr_data = Vec::new(); + cmpr_data.extend_from_slice(&cmpr); + cmpr_data.extend_from_slice(&data); + let cmpr_data = SSARef::try_from(cmpr_data).unwrap(); + + self.cmpr = 0.into(); + self.data = cmpr_data.into(); + self.atom_op = AtomOp::CmpExch(AtomCmpSrc::Packed); + } + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + match self.mem_space { + MemSpace::Global(addr_type) => { + if let AtomOp::CmpExch(cmp_src) = self.atom_op { + assert!(!self.dst.is_none()); + e.set_opcode(0x778, 2); + e.set_dst(&self.dst); + + // TODO: These are all supported by the disassembler but + // only the packed layout appears to be supported by real + // hardware (same as SM50) + let (data_src, data_layout) = match cmp_src { + AtomCmpSrc::Separate => { + if self.data.is_zero() { + (&self.cmpr, 1_u8) + } else { + assert!(self.cmpr.is_zero()); + (&self.data, 2_u8) + } + } + AtomCmpSrc::Packed => (&self.data, 0_u8), + }; + e.set_reg_src(23..31, data_src); + let data_type = match self.atom_type { + AtomType::U32 => 0_u8, + AtomType::U64 => 1_u8, + _ => panic!("Unsupported data type"), + }; + e.set_field(52..53, data_type); + e.set_field(53..55, data_layout); + } else { + e.set_opcode(0x680, 2); + e.set_dst(&self.dst); + e.set_reg_src(23..31, &self.data); + + let data_type = match self.atom_type { + AtomType::U32 => 0_u8, + AtomType::I32 => 1_u8, + AtomType::U64 => 2_u8, + AtomType::F32 => 3_u8, + // NOTE: U128 => 4_u8, + AtomType::I64 => 5_u8, + _ => panic!("Unsupported data type"), + }; + e.set_field(52..55, data_type); + + e.set_atom_op(55..59, self.atom_op); + } + // TODO: mem_order + + e.set_reg_src(10..18, &self.addr); + e.set_field(31..51, self.addr_offset); + + e.set_field( + 51..52, + match addr_type { + MemAddrType::A32 => 0_u8, + MemAddrType::A64 => 1_u8, + }, + ); + } + MemSpace::Local => panic!("Atomics do not support local"), + MemSpace::Shared => panic!( + "Shared atomics should be lowered into ld-locked and st-locked" + ), + } + } +} + +impl SM32Op for OpAL2P { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x7d0, 2); + + e.set_dst(&self.dst); + e.set_reg_src(10..18, &self.offset); + // 22: .s + e.set_field(23..34, self.addr); + e.set_bit(35, self.output); + + assert!(self.comps == 1); + e.set_field(50..52, 0_u8); // comps + } +} + +impl SM32Op for OpALd { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x7ec, 2); + + e.set_dst(&self.dst); + e.set_reg_src(10..18, &self.offset); + // 22: .s + e.set_field(23..34, self.addr); + + if self.phys { + assert!(!self.patch); + assert!(self.offset.src_ref.as_reg().is_some()); + } else if !self.patch { + assert!(self.offset.is_zero()); + } + e.set_bit(34, self.patch); + e.set_bit(35, self.output); + e.set_reg_src(42..50, &self.vtx); + + e.set_field(50..52, self.comps - 1); + } +} + +impl SM32Op for OpASt { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x7f0, 2); + + e.set_reg_src(2..10, &self.data); + e.set_reg_src(10..18, &self.offset); + 
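+ // ast mirrors ald's field layout; the data register sits in the dst
+ // slot (2..10).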
// 22: .s + e.set_field(23..34, self.addr); + + if self.phys { + assert!(!self.patch); + assert!(self.offset.src_ref.as_reg().is_some()); + } else if !self.patch { + assert!(self.offset.is_zero()); + } + e.set_bit(34, self.patch); + e.set_reg_src(42..50, &self.vtx); + + e.set_field(50..52, self.comps - 1); + } +} + +impl SM32Op for OpIpa { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x748, 2); + + e.set_dst(&self.dst); + e.set_reg_src(42..50, &self.offset); + e.set_reg_src(23..31, &self.inv_w); + + assert!(self.addr % 4 == 0); + e.set_field(31..42, self.addr); + + e.set_reg_src_ref(10..18, &SrcRef::Zero); // indirect addr + + e.set_bit(50, false); // .SAT + e.set_field( + 51..53, + match self.loc { + InterpLoc::Default => 0_u8, + InterpLoc::Centroid => 1_u8, + InterpLoc::Offset => 2_u8, + }, + ); + e.set_field( + 53..55, + match self.freq { + InterpFreq::Pass => 0_u8, + InterpFreq::PassMulW => 1_u8, + InterpFreq::Constant => 2_u8, + InterpFreq::State => 3_u8, + }, + ); + } +} + +impl SM32Op for OpCCtl { + fn legalize(&mut self, b: &mut LegalizeBuilder) { + legalize_ext_instr(self, b); + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + match self.mem_space { + MemSpace::Global(addr_type) => { + e.set_opcode(0x7b0, 2); + + assert!(self.addr_offset % 4 == 0); + e.set_field(25..55, self.addr_offset / 4); + e.set_field( + 55..56, + match addr_type { + MemAddrType::A32 => 0_u8, + MemAddrType::A64 => 1_u8, + }, + ); + } + MemSpace::Local => panic!("cctl does not support local"), + MemSpace::Shared => { + e.set_opcode(0x7c0, 2); + + assert!(self.addr_offset % 4 == 0); + e.set_field(25..47, self.addr_offset / 4); + } + } + e.set_field( + 2..6, + match self.op { + CCtlOp::Qry1 => 0_u8, + CCtlOp::PF1 => 1_u8, + CCtlOp::PF1_5 => 2_u8, + CCtlOp::PF2 => 3_u8, + CCtlOp::WB => 4_u8, + CCtlOp::IV => 5_u8, + CCtlOp::IVAll => 6_u8, + CCtlOp::RS => 7_u8, + CCtlOp::RSLB => 7_u8, + op => panic!("Unsupported cache control {op:?}"), + }, + ); + e.set_reg_src(10..18, &self.addr); + } +} + +impl SM32Op for OpMemBar { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x7cc, 2); + + e.set_field( + 10..12, + match self.scope { + MemScope::CTA => 0_u8, + MemScope::GPU => 1_u8, + MemScope::System => 2_u8, + }, + ); + } +} + +impl SM32Encoder<'_> { + fn set_rel_offset(&mut self, range: Range, label: &Label) { + assert!(range.len() == 24); + assert!(self.ip % 8 == 0); + + let ip = u32::try_from(self.ip).unwrap(); + let ip = i32::try_from(ip).unwrap(); + + let target_ip = *self.labels.get(label).unwrap(); + let target_ip = u32::try_from(target_ip).unwrap(); + let target_ip = i32::try_from(target_ip).unwrap(); + assert!(target_ip % 8 == 0); + + let rel_offset = target_ip - ip - 8; + + assert!(rel_offset % 8 == 0); + self.set_field(range, rel_offset); + } +} + +impl SM32Op for OpBra { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x120, 0); + e.set_field(2..6, 0xf_u8); // Condition code + // 7: bra to cbuf + // 8: .lmt (limit) + // 9: .u (uniform for warp) + e.set_rel_offset(23..47, &self.target); + } +} + +impl SM32Op for OpSSy { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x148, 0); + e.set_field(2..8, 0xf_u8); // flags + 
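+ // ssy pushes this reconvergence point onto the stack; sync (encoded
+ // as nop.s below) pops it.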
e.set_rel_offset(23..47, &self.target); + } +} + +impl SM32Op for OpSync { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + // emit nop.s + e.set_opcode(0x858, 2); + e.set_field(10..14, 0xf_u8); // flags + e.set_bit(22, true); // .s + } +} + +impl SM32Op for OpBrk { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x1a0, 0); + e.set_field(2..8, 0xf_u8); // flags + } +} + +impl SM32Op for OpPBk { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x150, 0); + e.set_field(2..8, 0xf_u8); // flags + e.set_rel_offset(23..47, &self.target); + } +} + +impl SM32Op for OpCont { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x1a8, 0); + e.set_field(2..8, 0xf_u8); // flags + } +} + +impl SM32Op for OpPCnt { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x158, 0); + e.set_field(2..8, 0xf_u8); // flags + e.set_rel_offset(23..47, &self.target); + } +} + +impl SM32Op for OpExit { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x180, 0); + e.set_field(2..8, 0xf_u8); // flags + } +} + +impl SM32Op for OpBar { + fn legalize(&mut self, _b: &mut LegalizeBuilder) { + // Nothing to do + } + + fn encode(&self, e: &mut SM32Encoder<'_>) { + e.set_opcode(0x854, 2); + + // Barrier id + e.set_reg_src_ref(10..18, &SrcRef::Zero); + // Thread count + e.set_reg_src_ref(23..31, &SrcRef::Zero); + + // 00: SYNC + // 01: ARV + // 02: RED + // 03: SCAN + // 04: SYNCALL + e.set_field(35..38, 0); + + // (only for RED) + // 00: .POPC + // 01: .AND + // 02: .OR + e.set_field(38..40, 0); + + // actually only useful for reductions. 
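+ // We only emit a plain bar.sync 0, so the predicate is hard-wired
+ // to PT.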
+impl SM32Op for OpBar {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x854, 2);
+
+        // Barrier id
+        e.set_reg_src_ref(10..18, &SrcRef::Zero);
+        // Thread count
+        e.set_reg_src_ref(23..31, &SrcRef::Zero);
+
+        // 00: SYNC
+        // 01: ARV
+        // 02: RED
+        // 03: SCAN
+        // 04: SYNCALL
+        e.set_field(35..38, 0_u8);
+
+        // (only for RED)
+        // 00: .POPC
+        // 01: .AND
+        // 02: .OR
+        e.set_field(38..40, 0_u8);
+
+        // actually only useful for reductions
+        e.set_pred_src(42..46, &SrcRef::True.into());
+
+        // 46: 1 if barr_id is immediate (imm: 10..18, max: 0xff)
+        // 47: 1 if thread_count is immediate (imm: 23..35, max: 0xfff)
+    }
+}
+
+impl SM32Op for OpTexDepBar {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x770, 2);
+        // Max number of textures still in the queue
+        e.set_field(23..29, self.textures_left);
+    }
+}
+
+impl SM32Op for OpViLd {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x7f8, 2);
+        e.set_dst(&self.dst);
+        e.set_reg_src(10..18, &self.idx);
+        e.set_field(23..31, self.off);
+    }
+}
+
+impl SM32Op for OpKill {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x198, 0);
+        e.set_field(2..8, 0xf_u8); // flags
+    }
+}
+
+impl SM32Op for OpNop {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x858, 2);
+        e.set_field(10..14, 0xf_u8); // flags
+    }
+}
+
+impl SM32Op for OpPixLd {
+    fn legalize(&mut self, b: &mut LegalizeBuilder) {
+        legalize_ext_instr(self, b);
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x7f4, 2);
+        e.set_dst(&self.dst);
+
+        e.set_reg_src(10..18, &SrcRef::Zero.into()); // addr
+        e.set_field(23..31, 0_u16); // offset
+
+        e.set_field(
+            34..37,
+            match &self.val {
+                PixVal::MsCount => 0_u8,
+                PixVal::CovMask => 1_u8,
+                PixVal::Covered => 2_u8,
+                PixVal::Offset => 3_u8,
+                PixVal::CentroidOffset => 4_u8,
+                PixVal::MyIndex => 5_u8,
+                other => panic!("Unsupported PixVal: {other}"),
+            },
+        );
+    }
+}
+
+impl SM32Op for OpS2R {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x864, 2);
+        e.set_dst(&self.dst);
+        e.set_field(23..31, self.idx);
+    }
+}
+
+impl SM32Op for OpVote {
+    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
+        // Nothing to do
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.set_opcode(0x86c, 2);
+
+        e.set_dst(&self.ballot);
+        e.set_pred_dst(48..51, &self.vote);
+
+        e.set_pred_src(42..46, &self.pred);
+
+        e.set_field(
+            51..53,
+            match self.op {
+                VoteOp::All => 0_u8,
+                VoteOp::Any => 1_u8,
+                VoteOp::Eq => 2_u8,
+            },
+        );
+    }
+}
+
+impl SM32Op for OpOut {
+    fn legalize(&mut self, b: &mut LegalizeBuilder) {
+        use RegFile::GPR;
+        b.copy_alu_src_if_not_reg(&mut self.handle, GPR, SrcType::GPR);
+        b.copy_alu_src_if_i20_overflow(&mut self.stream, GPR, SrcType::ALU);
+    }
+
+    fn encode(&self, e: &mut SM32Encoder<'_>) {
+        e.encode_form_immreg(
+            0xb70,
+            0x1f0,
+            Some(&self.dst),
+            &self.handle,
+            &self.stream,
+            None,
+            false,
+        );
+
+        e.set_field(
+            42..44,
+            match self.out_type {
+                OutType::Emit => 1_u8,
+                OutType::Cut => 2_u8,
+                OutType::EmitThenCut => 3_u8,
+            },
+        );
+    }
+}
+
+// TODO: instructions left over from the codegen rewrite that we might
+// use in the future:
+// - 0x138 pret.noinc
+// - 0x1b8 quadon (enable all threads in quad)
+// - 0x1c0 quadpop (re-disable them)
+// - 0x190 ret
+macro_rules! as_sm32_op_match {
     ($op: expr) => {
         match $op {
+            Op::FAdd(op) => op,
+            Op::FFma(op) => op,
+            Op::FMnMx(op) => op,
+            Op::FMul(op) => op,
+            Op::Rro(op) => op,
+            Op::MuFu(op) => op,
+            Op::FSet(op) => op,
+            Op::FSetP(op) => op,
+            Op::FSwzAdd(op) => op,
+            Op::DAdd(op) => op,
+            Op::DFma(op) => op,
+            Op::DMnMx(op) => op,
+            Op::DMul(op) => op,
+            Op::DSetP(op) => op,
+            Op::Bfe(op) => op,
+            Op::Flo(op) => op,
+            Op::IAdd2(op) => op,
+            Op::IAdd2X(op) => op,
+            Op::IMad(op) => op,
+            Op::IMul(op) => op,
+            Op::IMnMx(op) => op,
+            Op::ISetP(op) => op,
+            Op::Lop2(op) => op,
+            Op::PopC(op) => op,
+            Op::Shf(op) => op,
+            Op::Shl(op) => op,
+            Op::Shr(op) => op,
+            Op::F2F(op) => op,
+            Op::F2I(op) => op,
+            Op::I2F(op) => op,
+            Op::I2I(op) => op,
+            Op::Mov(op) => op,
+            Op::Prmt(op) => op,
+            Op::Sel(op) => op,
+            Op::Shfl(op) => op,
+            Op::PSetP(op) => op,
+            Op::Ld(op) => op,
+            Op::Ldc(op) => op,
+            Op::St(op) => op,
+            Op::Atom(op) => op,
+            Op::AL2P(op) => op,
+            Op::ALd(op) => op,
+            Op::ASt(op) => op,
+            Op::Ipa(op) => op,
+            Op::CCtl(op) => op,
+            Op::MemBar(op) => op,
+            Op::Bra(op) => op,
+            Op::SSy(op) => op,
+            Op::Sync(op) => op,
+            Op::Brk(op) => op,
+            Op::PBk(op) => op,
+            Op::Cont(op) => op,
+            Op::PCnt(op) => op,
+            Op::Exit(op) => op,
+            Op::Bar(op) => op,
+            Op::TexDepBar(op) => op,
+            Op::ViLd(op) => op,
+            Op::Kill(op) => op,
+            Op::Nop(op) => op,
+            Op::PixLd(op) => op,
+            Op::S2R(op) => op,
+            Op::Vote(op) => op,
+            Op::Out(op) => op,
             _ => panic!("Unhandled instruction {}", $op),
         }
     };
 }
 
-#[allow(dead_code)]
 fn as_sm32_op(op: &Op) -> &dyn SM32Op {
     as_sm32_op_match!(op)
 }
 
 fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op {
     as_sm32_op_match!(op)
 }
 
-fn encode_sm32_shader(_sm: &ShaderModel32, _s: &Shader<'_>) -> Vec<u32> {
-    todo!("Implement SM32 encoding");
+fn encode_instr(
+    instr: Option<&Box<Instr>>,
+    sm: &ShaderModel32,
+    labels: &HashMap<Label, usize>,
+    encoded: &mut Vec<u32>,
+) -> u8 {
+    let mut e = SM32Encoder {
+        sm,
+        ip: encoded.len() * 4,
+        labels,
+        inst: [0_u32; 2],
+        sched: 0,
+    };
+
+    if let Some(instr) = instr {
+        as_sm32_op(&instr.op).encode(&mut e);
+        e.set_pred(&instr.pred);
+        e.set_instr_dependency(&instr.deps);
+    } else {
+        let nop = OpNop { label: None };
+        nop.encode(&mut e);
+        e.set_pred(&true.into());
+        e.set_instr_dependency(&InstrDeps::new());
+    }
+
+    encoded.extend(&e.inst[..]);
+
+    e.sched
+}
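+
+// Note: every encoded instruction is two u32 words (8 bytes), so the byte
+// ip of the next instruction is encoded.len() * 4. encode_instr appends
+// the two words and returns the instruction's one-byte scheduling info,
+// which the caller packs into the sched word that precedes each group of
+// seven instructions (see encode_sm32_shader below).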
as_sm50_op_match { ($op: expr) => { match $op { + Op::FAdd(op) => op, + Op::FFma(op) => op, + Op::FMnMx(op) => op, + Op::FMul(op) => op, + Op::Rro(op) => op, + Op::MuFu(op) => op, + Op::FSet(op) => op, + Op::FSetP(op) => op, + Op::FSwz(op) => op, + Op::DAdd(op) => op, + Op::DFma(op) => op, + Op::DMnMx(op) => op, + Op::DMul(op) => op, + Op::DSetP(op) => op, + Op::Bfe(op) => op, + Op::Flo(op) => op, + Op::IAdd2(op) => op, + Op::IAdd2X(op) => op, + Op::IMad(op) => op, + Op::IMul(op) => op, + Op::IMnMx(op) => op, + Op::ISetP(op) => op, + Op::Lop2(op) => op, + Op::PopC(op) => op, + Op::Shf(op) => op, + Op::Shl(op) => op, + Op::Shr(op) => op, + Op::F2F(op) => op, + Op::F2I(op) => op, + Op::I2F(op) => op, + Op::I2I(op) => op, + Op::Mov(op) => op, + Op::Prmt(op) => op, + Op::Sel(op) => op, + Op::Shfl(op) => op, + Op::PSetP(op) => op, + Op::Ld(op) => op, + Op::Ldc(op) => op, + Op::St(op) => op, + Op::Atom(op) => op, + Op::AL2P(op) => op, + Op::ALd(op) => op, + Op::ASt(op) => op, + Op::Ipa(op) => op, + Op::CCtl(op) => op, + Op::MemBar(op) => op, + Op::Bra(op) => op, + Op::SSy(op) => op, + Op::Sync(op) => op, + Op::Brk(op) => op, + Op::PBk(op) => op, + Op::Cont(op) => op, + Op::PCnt(op) => op, + Op::Exit(op) => op, + Op::Bar(op) => op, + Op::TexDepBar(op) => op, + Op::ViLd(op) => op, + Op::Kill(op) => op, + Op::Nop(op) => op, + Op::PixLd(op) => op, + Op::S2R(op) => op, + Op::Vote(op) => op, + Op::Out(op) => op, _ => panic!("Unhandled instruction {}", $op), } }; } -#[allow(dead_code)] fn as_sm32_op(op: &Op) -> &dyn SM32Op { - as_sm32_op_match!(op) + as_sm50_op_match!(op) } fn as_sm32_op_mut(op: &mut Op) -> &mut dyn SM32Op { - as_sm32_op_match!(op) + as_sm50_op_match!(op) } -fn encode_sm32_shader(_sm: &ShaderModel32, _s: &Shader<'_>) -> Vec { - todo!("Implement SM32 encoding"); +fn encode_instr( + instr: Option<&Box>, + sm: &ShaderModel32, + labels: &HashMap, + encoded: &mut Vec, +) -> u8 { + let mut e = SM32Encoder { + sm: sm, + ip: encoded.len() * 4, + labels, + inst: [0_u32; 2], + sched: 0, + }; + + if let Some(instr) = instr { + as_sm32_op(&instr.op).encode(&mut e); + e.set_pred(&instr.pred); + e.set_instr_dependency(&instr.deps); + } else { + let nop = OpNop { label: None }; + nop.encode(&mut e); + e.set_pred(&true.into()); + e.set_instr_dependency(&InstrDeps::new()); + } + + encoded.extend(&e.inst[..]); + + e.sched +} + +fn encode_sm32_shader(sm: &ShaderModel32, s: &Shader<'_>) -> Vec { + assert!(s.functions.len() == 1); + let func = &s.functions[0]; + + let mut ip = 0_usize; + let mut labels = HashMap::new(); + for b in &func.blocks { + // We ensure blocks will have groups of 7 instructions with a + // schedule instruction before each groups. As we should never jump + // to a schedule instruction, we account for that here. + labels.insert(b.label, ip + 8); + + let block_num_instrs = b.instrs.len().next_multiple_of(7); + + // Every 7 instructions, we have a new schedule instruction so we + // need to account for that. 
+    let mut encoded = Vec::new();
+    for b in &func.blocks {
+        for sched_chunk in b.instrs.chunks(7) {
+            let sched_i = encoded.len();
+            let mut sched_instr = [0_u32; 2];
+            encoded.extend(&sched_instr[..]); // Push now, will edit later
+            let mut bv = BitMutView::new(&mut sched_instr);
+            bv.set_field(0..2, 0b00_u8);
+            bv.set_field(58..64, 0b000010_u8); // 0x08 in the top byte
+            let mut bv = bv.subset_mut(2..58);
+
+            for (i, instr) in sched_chunk.iter().enumerate() {
+                let sched =
+                    encode_instr(Some(instr), sm, &labels, &mut encoded);
+
+                bv.set_field(i * 8..(i + 1) * 8, sched);
+            }
+            // Encode remaining slots in the chunk as NOPs
+            for _ in sched_chunk.len()..7 {
+                encode_instr(None, sm, &labels, &mut encoded);
+            }
+            encoded[sched_i] = sched_instr[0];
+            encoded[sched_i + 1] = sched_instr[1];
+        }
+    }
+
+    encoded
 }