diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index d978744bcee..f2b5b3c4024 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2534,6 +2534,155 @@ impl<'a> ShaderFromNir<'a> { }); self.set_dst(&intrin.def, dst); } + nir_intrinsic_suclamp_nv => { + let coords = self.get_src(&srcs[0]); + let params = self.get_src(&srcs[1]); + + let flags = intrin.flags(); + let flags: nak_nir_suclamp_flags = + unsafe { std::mem::transmute_copy(&flags) }; + + let mode = match flags.mode() { + NAK_SUCLAMP_MODE_BLOCK_LINEAR => SuClampMode::BlockLinear, + NAK_SUCLAMP_MODE_PITCH_LINEAR => SuClampMode::PitchLinear, + NAK_SUCLAMP_MODE_STORED_DESCRIPTOR => { + SuClampMode::StoredInDescriptor + } + _ => panic!("Invalid suclamp mode"), + }; + + let round = match flags.round() { + NAK_SUCLAMP_ROUND_R1 => SuClampRound::R1, + NAK_SUCLAMP_ROUND_R2 => SuClampRound::R2, + NAK_SUCLAMP_ROUND_R4 => SuClampRound::R4, + NAK_SUCLAMP_ROUND_R8 => SuClampRound::R8, + NAK_SUCLAMP_ROUND_R16 => SuClampRound::R16, + _ => panic!("Invalid suclamp round"), + }; + + let dst = b.alloc_ssa(RegFile::GPR); + let out_of_bounds = b.alloc_ssa(RegFile::Pred); + b.push_op(OpSuClamp { + dst: dst.into(), + out_of_bounds: out_of_bounds.into(), + coords, + params, + mode, + round, + is_2d: flags.is_2d(), + is_s32: flags.is_s32(), + imm: 0, + }); + let final_dst = + vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())]; + + self.set_ssa(&intrin.def, final_dst); + } + nir_intrinsic_subfm_nv => { + let x = self.get_src(&srcs[0]); + let y = self.get_src(&srcs[1]); + let z = self.get_src(&srcs[2]); + let is_3d = intrin.flags() != 0; + + let dst = b.alloc_ssa(RegFile::GPR); + let out_of_bounds = b.alloc_ssa(RegFile::Pred); + b.push_op(OpSuBfm { + dst: dst.into(), + pdst: out_of_bounds.into(), + srcs: [x, y, z], + is_3d, + }); + let final_dst = + vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())]; + + self.set_ssa(&intrin.def, 
final_dst); + } + nir_intrinsic_sueau_nv => { + let off = self.get_src(&srcs[0]); + let bit_field = self.get_src(&srcs[1]); + let addr = self.get_src(&srcs[2]); + + let dst = b.alloc_ssa(RegFile::GPR); + b.push_op(OpSuEau { + dst: dst.into(), + off, + bit_field, + addr, + }); + self.set_dst(&intrin.def, dst.into()); + } + nir_intrinsic_imadsp_nv => { + let src0 = self.get_src(&srcs[0]); + let src1 = self.get_src(&srcs[1]); + let src2 = self.get_src(&srcs[2]); + + let flags = intrin.flags(); + let flags: nak_nir_imadsp_flags = + unsafe { std::mem::transmute_copy(&flags) }; + + let translate_src_type = |s| { + use IMadSpSrcType::*; + match s { + NAK_IMAD_TYPE_U32 => U32, + NAK_IMAD_TYPE_U24 => U24, + NAK_IMAD_TYPE_U16_LO => U16Lo, + NAK_IMAD_TYPE_U16_HI => U16Hi, + NAK_IMAD_TYPE_S32 => S32, + NAK_IMAD_TYPE_S24 => S24, + NAK_IMAD_TYPE_S16_LO => S16Lo, + NAK_IMAD_TYPE_S16_HI => S16Hi, + _ => panic!("Invalid imadsp mode"), + } + }; + + let mode = if flags.params_from_src1() { + IMadSpMode::FromSrc1 + } else { + IMadSpMode::Explicit([ + translate_src_type(flags.src0()), + translate_src_type(flags.src1()), + translate_src_type(flags.src2()), + ]) + }; + + let dst = b.alloc_ssa(RegFile::GPR); + b.push_op(OpIMadSp { + srcs: [src0, src1, src2], + dst: dst.into(), + mode, + }); + self.set_dst(&intrin.def, dst.into()); + } + nir_intrinsic_suldga_nv => { + let addr = self.get_src(&srcs[0]); + let format = self.get_src(&srcs[1]); + let out_of_bounds = self.get_src(&srcs[2]); + + let comps = intrin.num_components; + + assert!(intrin.def.bit_size() == 32); + let mem_type = self.get_image_mem_type(intrin); + + let flags = intrin.flags(); + let offset_mode = match flags { + NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32, + NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32, + NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8, + NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8, + _ => panic!("Invalid suldga flags"), + }; + + let dst = b.alloc_ssa_vec(RegFile::GPR, comps); + b.push_op(OpSuLdGa { + dst: 
dst.clone().into(), + addr, + format, + out_of_bounds, + mem_type, + offset_mode, + }); + self.set_dst(&intrin.def, dst); + } nir_intrinsic_bindless_image_load | nir_intrinsic_bindless_image_load_raw_nv => { let handle = self.get_src(&srcs[0]); @@ -2624,6 +2773,33 @@ impl<'a> ShaderFromNir<'a> { self.set_ssa(&intrin.def, final_dst); } + nir_intrinsic_sustga_nv => { + let addr = self.get_src(&srcs[0]); + let format = self.get_src(&srcs[1]); + let out_of_bounds = self.get_src(&srcs[2]); + + let data = self.get_src(&srcs[3]); + let image_access = + ImageAccess::Formatted(ChannelMask::new(0xf)); + + let flags = intrin.flags(); + let offset_mode = match flags { + NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32, + NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32, + NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8, + NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8, + _ => panic!("Invalid sustga flags"), + }; + + b.push_op(OpSuStGa { + addr, + format, + data, + out_of_bounds, + image_access, + offset_mode, + }); + } nir_intrinsic_bindless_image_store => { let handle = self.get_src(&srcs[0]); let dim = self.get_image_dim(intrin); diff --git a/src/nouveau/compiler/nak/hw_tests.rs b/src/nouveau/compiler/nak/hw_tests.rs index fef6c54d8b8..14ca3b30c7c 100644 --- a/src/nouveau/compiler/nak/hw_tests.rs +++ b/src/nouveau/compiler/nak/hw_tests.rs @@ -1180,6 +1180,126 @@ fn test_iadd64() { } } +#[test] +fn test_op_suclamp() { + if !RunSingleton::get().sm.is_kepler() { + return; + } + + // We cannot test every single combination of options. 
+ // Use a random generator for rounding and immediate + let mut a = Acorn::new(); + for mode in [ + SuClampMode::StoredInDescriptor, + SuClampMode::PitchLinear, + SuClampMode::BlockLinear, + ] { + for i in 0..4 { + let is_s32 = (i & (1 << 0)) != 0; + let is_2d = (i & (1 << 1)) != 0; + // immediate is an i6 value + let imm = (a.get_u32() % 64) as i8 - 32; + let round = match a.get_u32() % 5 { + 0 => SuClampRound::R1, + 1 => SuClampRound::R2, + 2 => SuClampRound::R4, + 3 => SuClampRound::R8, + _ => SuClampRound::R16, + }; + + let op = OpSuClamp { + dst: Dst::None, + out_of_bounds: Dst::None, + mode, + round, + is_s32, + is_2d, + coords: 0.into(), + params: 0.into(), + imm, + }; + + test_foldable_op(op); + } + } +} + +#[test] +fn test_op_subfm() { + if !RunSingleton::get().sm.is_kepler() { + return; + } + + for is_3d in [false, true] { + let op = OpSuBfm { + dst: Dst::None, + pdst: Dst::None, + srcs: [0.into(), 0.into(), 0.into()], + is_3d, + }; + + test_foldable_op(op); + } +} + +#[test] +fn test_op_sueau() { + if !RunSingleton::get().sm.is_kepler() { + return; + } + + let op = OpSuEau { + dst: Dst::None, + off: 0.into(), + bit_field: 0.into(), + addr: 0.into(), + }; + + test_foldable_op(op); +} + +#[test] +fn test_op_imadsp() { + if !RunSingleton::get().sm.is_kepler() { + return; + } + + use IMadSpSrcType::*; + let src0_w = [U32, U24, U16Lo, U16Hi]; + let src1_w = [U24, U16Lo]; + let src2_w = [U32, U24, U16Lo]; + + let mut modes = vec![]; + + // Cartesian product + for w0 in src0_w { + for w1 in src1_w { + for w2 in src2_w { + for sign in 0..4 { + let s0 = (sign & 0x1) != 0; + let s1 = (sign & 0x2) != 0; + let s2 = s0 || s1; + modes.push(IMadSpMode::Explicit([ + w0.with_sign(s0), + w1.with_sign(s1), + w2.with_sign(s2), + ])) + } + } + } + } + modes.push(IMadSpMode::FromSrc1); + + for mode in modes { + let op = OpIMadSp { + dst: Dst::None, + srcs: [0.into(), 0.into(), 0.into()], + mode, + }; + test_foldable_op(op); + } +} + #[test] fn test_ineg64() { let run = 
RunSingleton::get(); diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index e9db804ecc0..c813a0dd62f 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -4,7 +4,7 @@ extern crate bitview; extern crate nak_ir_proc; -use bitview::{BitMutView, BitView}; +use bitview::{BitMutView, BitMutViewable, BitView, BitViewable, SetField}; use nak_bindings::*; pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder}; @@ -5183,6 +5183,669 @@ impl DisplayOp for OpSuAtom { } impl_display_for_op!(OpSuAtom); +#[derive(Clone, Copy)] +pub enum SuClampMode { + StoredInDescriptor, + PitchLinear, + BlockLinear, +} + +impl fmt::Display for SuClampMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + SuClampMode::StoredInDescriptor => ".sd", + SuClampMode::PitchLinear => ".pl", + SuClampMode::BlockLinear => ".bl", + }; + write!(f, "{}", s) + } +} + +#[derive(Clone, Copy)] +pub enum SuClampRound { + R1, + R2, + R4, + R8, + R16, +} + +impl SuClampRound { + pub fn to_int(&self) -> u8 { + match self { + SuClampRound::R1 => 1, + SuClampRound::R2 => 2, + SuClampRound::R4 => 4, + SuClampRound::R8 => 8, + SuClampRound::R16 => 16, + } + } + + pub fn to_mask(&self) -> u32 { + !(self.to_int() as u32 - 1) + } +} + +impl fmt::Display for SuClampRound { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, ".r{}", self.to_int()) + } +} + +/// Kepler only +/// Surface Clamp +/// +/// Can clamp coordinates of surface operations in a 0..=clamp inclusive +/// range. It also computes other information useful to compute the +/// real address of an element within an image for both block-linear and +/// pitch-linear layouts. We can also reduce this operation to a "stupid" +/// inclusive clamp by setting modifier Mode=PitchLinear and is_2d=false +/// this will not compute any extra operations and is useful to clamp array +/// indexes.
+/// +/// Since the shader code does not know if an image layout is block-linear +/// or pitch-linear, this opcode must be able to do both, the operation +/// is then selected by the "clamp" bitfield, usually read from a descriptor. +/// In block-linear mode we divide the bits that will compute the higher +/// part and the lower part. +#[repr(C)] +#[derive(SrcsAsSlice, DstsAsSlice, Clone)] +pub struct OpSuClamp { + #[dst_type(GPR)] + pub dst: Dst, + #[dst_type(Pred)] + pub out_of_bounds: Dst, + + /// This modifier specifies if we use pitch-linear or block-linear + /// calculations, another option is to support both and read the actual + /// format from the clamp (shader code doesn't always know an image's + /// layout). + /// When mode=pitch_linear and is_2d=false the suclamp op enters a + /// simpler "plain" mode where it only performs clamping and the output + /// register doesn't contain any information bits about pitch-linear or + /// block-linear calculations + pub mode: SuClampMode, + /// Strangely enough, "round" just rounds the clamp, not the source + /// this does not help at all with clamping coordinates. + /// It could be useful when clamping raw addresses of a multi-byte read. + /// ex: if we read 4 bytes at once, and the buffer length is 16, + /// the bounds will be 15 (they are inclusive), but if we read + /// at address 15 we would read bytes 15..19, so we are out of range. + /// if we clamp the bounds to R4 the effective bound becomes 12 + /// so the read will be performed from 12..16, remaining in bounds. + pub round: SuClampRound, + pub is_s32: bool, + pub is_2d: bool, + + #[src_type(GPR)] + pub coords: Src, + + /// Packed parameter containing both bounds (inclusive) + /// and other information (explained in more detail in Foldable): + /// 0..20: bound (inclusive) + /// 21: pitch_linear (used if mode == StoredInDescriptor) + /// 22..26: coord shl + /// 26..29: coord shr + /// 29..32: n.
of tiles + #[src_type(ALU)] + pub params: Src, + /// Added to the coords, it's only an i6 + pub imm: i8, +} + +impl Foldable for OpSuClamp { + fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { + let src = f.get_u32_src(self, &self.coords); + let params = f.get_u32_src(self, &self.params); + let imm = self.imm; // i6 + + let src = if self.is_s32 { + (src as i32) as i64 + } else { + src as i64 + }; + let src = src + (imm as i64); + + let params_bv = BitView::new(¶ms); + let pitch_linear = match self.mode { + SuClampMode::StoredInDescriptor => params_bv.get_bit(21), + SuClampMode::PitchLinear => true, + SuClampMode::BlockLinear => false, + }; + + let bounds = if pitch_linear && !self.is_2d { + params + } else { + params_bv.get_bit_range_u64(0..20) as u32 + }; + + let bounds = bounds & self.round.to_mask(); + let (is_oob, clamped) = if src < 0 { + (true, 0) + } else if src > (bounds as i64) { + (true, bounds) + } else { + (false, src as u32) + }; + + let mut out = 0u32; + let mut bv = BitMutView::new(&mut out); + if pitch_linear { + if !self.is_2d { + // simple clamp mode, NO BITFIELD + bv.set_field(0..32, clamped); + } else { + // Real, pitch_linear mode + bv.set_field(0..20, clamped & 0xfffff); + + // Pass through el_size_log2 + bv.set_field(27..30, params_bv.get_bit_range_u64(26..29)); + bv.set_bit(30, true); // pitch_linear=true + bv.set_bit(31, is_oob); + } + } else { + // Block linear + + // Number of bits to discard for GoB coordinates + let shr_a = params_bv.get_bit_range_u64(22..26) as u8; + // Block coords + bv.set_field(0..16, (clamped >> shr_a) & 0xffff); + + // Shift applied to coords, always zero except for x. + // (for coord x=1 and format R32, we want to access byte 4) + // e.g. R8 -> 0, R32 -> 2, 128 -> 4 + let el_size_log2 = params_bv.get_bit_range_u64(26..29) as u8; + // Coord inside GoB (element space) + bv.set_field(16..24, (clamped << el_size_log2) & 0xff); + + // Useful later to compute gob-space coords. 
+ let n_tiles = params_bv.get_bit_range_u64(29..32) as u8; + bv.set_field(27..30, n_tiles); + bv.set_bit(30, false); // pitch_linear=false + bv.set_bit(31, is_oob); + } + f.set_u32_dst(self, &self.dst, out); + f.set_pred_dst(self, &self.out_of_bounds, is_oob); + } +} + +impl DisplayOp for OpSuClamp { + fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "suclamp{}", self.mode)?; + if !matches!(self.round, SuClampRound::R1) { + write!(f, "{}", self.round)?; + } + if !self.is_s32 { + write!(f, ".u32")?; + } + if !self.is_2d { + write!(f, ".1d")?; + } + + write! {f, " {} {} {:x}", self.coords, self.params, self.imm} + } +} +impl_display_for_op!(OpSuClamp); + +/// Kepler only +/// BitField Merge +/// +/// The resulting bit-field is composed of a high-part 8..32 that is merged +/// with the address by sueau, and a lower-part 0..8 that is provided +/// directly to suldga/sustga and defines the lower offset of the global array. +#[repr(C)] +#[derive(SrcsAsSlice, DstsAsSlice, Clone)] +pub struct OpSuBfm { + #[dst_type(GPR)] + pub dst: Dst, + #[dst_type(Pred)] + pub pdst: Dst, + + /// x, y, z + #[src_type(ALU)] + pub srcs: [Src; 3], + /// When is_3d=false the third source is ignored, but still used in + /// pitch-linear computation. + pub is_3d: bool, +} + +impl Foldable for OpSuBfm { + fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { + let x_raw = f.get_u32_src(self, &self.srcs[0]); + let y_raw = f.get_u32_src(self, &self.srcs[1]); + let z_raw = f.get_u32_src(self, &self.srcs[2]); + + let x = BitView::new(&x_raw); + let y = BitView::new(&y_raw); + let z = BitView::new(&z_raw); + + let mut o_raw = 0u32; + let mut o = BitMutView::new(&mut o_raw); + + let is_pitch_linear_2d = x.get_bit(30) || y.get_bit(30); + + if !is_pitch_linear_2d { + // Copy coordinates inside of GoB space. + // They are 6 bits from x and 3 from y (GoB is 64x8 bytes). + // Bits from 0..8 are ignored by sueau and are used directly + // by suldga/sustga.
+ // Bit 9 will become the first bit of the higher part in + // sueau. + o.set_bit_range_u64(0..4, x.get_bit_range_u64(16..20)); + + // Address calculation inside of GoB should virtually be + // y * 64 + x * element_size (each row is linear). + // So why are those bits swizzled like so? + // I have no idea, but these are correct even for atomics + // that accept real addresses. + o.set_bit(4, y.get_bit(16)); + o.set_bit(5, y.get_bit(17)); + o.set_bit(6, x.get_bit(20)); + o.set_bit(7, y.get_bit(18)); + + o.set_bit(8, x.get_bit(21)); + // 9..11: 0 + + // -------------- Tiles -------------- + // Number of tiles log2 + let ntx = x.get_bit_range_u64(27..30) & 0x1; + let nty = y.get_bit_range_u64(27..30); + let ntz = z.get_bit_range_u64(27..30); + let ntz = ntz * (self.is_3d as u64); // z is ignored if is_3d=false + + // Computes how many bits to dedicate to GoB coords inside + // a block + o.set_field(12..16, ntx + nty + ntz); + + // Coords in gob_space. + // Remove 6 bits from x and 3 bits from y, those are used + // as element coords in GoB space. + let a = x.get_bit_range_u64(22..24); // 1100_0000 + let b = y.get_bit_range_u64(19..24); // 1111_1000 + let c = z.get_bit_range_u64(16..24); // 1111_1111 + + // nt* indicates how many bits to consider (max 5) + let a = a & ((1 << ntx) - 1); + let b = b & ((1 << nty.min(5)) - 1); + let c = c & ((1 << ntz.min(5)) - 1); + + // Compute gob offset + // We can just or together at certain offsets because + // Tiles are always powers of two in each direction. + // z || y || x (LSB) + let res = c; + let res = (res << nty) | b; + let res = (res << ntx) | a; + let mask = match ntx { + 0 => 0x3ff, + _ => 0x7ff, + }; + + // gob coords will be put before the block coords in + // sueau. 
+ o.set_field(16..27, res & mask); + } else { + let d = z.get_bit_range_u64(0..8); + let el_size_log2 = x.get_bit_range_u64(27..30); + o.set_field(0..8, (d << el_size_log2) & 0xff); + // 9..11: 0 + o.set_field(12..15, el_size_log2); + } + + o.set_bit(11, is_pitch_linear_2d); + + let is_oob = + x.get_bit(31) || y.get_bit(31) || (z.get_bit(31) && self.is_3d); + f.set_u32_dst(self, &self.dst, o_raw); + f.set_pred_dst(self, &self.pdst, is_oob); + } +} + +impl DisplayOp for OpSuBfm { + fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "subfm")?; + + if self.is_3d { + write!(f, ".3d")?; + } + + write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2]) + } +} +impl_display_for_op!(OpSuBfm); + +/// Kepler only +/// Used to compute the higher 32 bits of image address using +/// the merged bitfield and the block coordinates (offset). +/// It can switch to a pitch_linear mode (bit 11 of bit-field). +#[repr(C)] +#[derive(SrcsAsSlice, DstsAsSlice, Clone)] +pub struct OpSuEau { + #[dst_type(GPR)] + pub dst: Dst, + + /// offset is computed from the block coordinates. + /// it's ok to add it directly to the address since they are both + /// "aligned" to 64 (the first 8 bits are removed from both) + #[src_type(GPR)] + pub off: Src, + + /// 8.. 
9: offset, last bit + /// 11..12: pitch_linear: when enabled the bf-offset is ignored and + /// the off_shl is subtracted by 8 + /// 12..16: off_shl, shifts left the offset by off_shl + 1 + /// 16..27: 11-bit offset, when joined with the 1-bit offset completes the + /// 12-bit offset ORed to the src offset after shifting + /// (unless pitch_linear) + #[src_type(ALU)] + pub bit_field: Src, + + #[src_type(GPR)] + pub addr: Src, +} + +impl Foldable for OpSuEau { + fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { + let off_raw = f.get_u32_src(self, &self.off); + let bf_raw = f.get_u32_src(self, &self.bit_field); + let addr = f.get_u32_src(self, &self.addr); + + let bf = BitView::new(&bf_raw); + + let off1 = bf.get_bit_range_u64(8..9) as u32; + let is_pitch_linear = bf.get_bit(11); + let off_shift = bf.get_bit_range_u64(12..16) as u32; + let offs = bf.get_bit_range_u64(16..27) as u32; + + let res = if !is_pitch_linear { + // Block linear + // off_raw are the block coordinates + // to those we add gob coordinates from the merged bitfield + // and the MSB of in-gob coordinates. + let omul = off_shift + 1; + let real_off = (off_raw << omul) | (offs << 1) | off1; + addr.wrapping_add(real_off & 0x7ff_ffff) + } else { + // Add the high part of the coordinates to addr + // off << (omul - 8) + // but for negative values do a shr instead. + // In fact, off_shift will always be < 8 because pitch_linear + // subfm only assigns bits 12..15, so this is always a shr + let shl_amount = off_shift as i32 - 8; + let off = if shl_amount < 0 { + off_raw >> (-shl_amount as u32) + } else { + off_raw << (shl_amount as u32) + }; + addr.wrapping_add(off & 0xff_ffff) + }; + f.set_u32_dst(self, &self.dst, res); + } +} + +impl DisplayOp for OpSuEau { + fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write! 
{f, "sueau {} {} {}", self.off, self.bit_field, self.addr} + } +} +impl_display_for_op!(OpSuEau); + +#[derive(Copy, Clone, Debug)] +pub enum IMadSpSrcType { + U32, + U24, + U16Hi, + U16Lo, + S32, + S24, + S16Hi, + S16Lo, +} + +impl IMadSpSrcType { + pub fn unsigned(self) -> IMadSpSrcType { + use IMadSpSrcType::*; + match self { + S32 => U32, + S24 => U24, + S16Hi => U16Hi, + S16Lo => U16Lo, + x => x, + } + } + + #[allow(dead_code)] // Used in hw_tests + pub fn with_sign(self, sign: bool) -> Self { + use IMadSpSrcType::*; + if !sign { + return self.unsigned(); + } + match self { + U32 => S32, + U24 => S24, + U16Hi => S16Hi, + U16Lo => S16Lo, + x => x, + } + } + + pub fn sign(self) -> bool { + use IMadSpSrcType::*; + match self { + U32 | U24 | U16Hi | U16Lo => false, + S32 | S24 | S16Hi | S16Lo => true, + } + } + + fn cast(&self, v: u32) -> i64 { + use IMadSpSrcType::*; + match self { + U32 => v as i64, + U24 => (v & 0x00ff_ffff) as i64, + U16Lo => (v as u16) as i64, + U16Hi => (v >> 16) as i64, + S32 => (v as i32) as i64, + S24 => (((v as i32) << 8) >> 8) as i64, // Sign extend + S16Lo => (v as i16) as i64, + S16Hi => ((v >> 16) as i16) as i64, + } + } +} + +impl fmt::Display for IMadSpSrcType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let sign = if self.sign() { ".s" } else { ".u" }; + let width = match self.unsigned() { + IMadSpSrcType::U32 => "32", + IMadSpSrcType::U24 => "24", + IMadSpSrcType::U16Lo => "16h0", + IMadSpSrcType::U16Hi => "16h1", + _ => unreachable!(), + }; + write!(f, "{}{}", sign, width) + } +} + +#[derive(Clone, Copy, Debug)] +pub enum IMadSpMode { + Explicit([IMadSpSrcType; 3]), + // Parameters are loaded from src1 bits 26..32 + FromSrc1, +} + +impl fmt::Display for IMadSpMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + IMadSpMode::Explicit([a, b, c]) => write!(f, "{a}{b}{c}"), + IMadSpMode::FromSrc1 => write!(f, ".sd"), + } + } +} + +/// Kepler only +/// Extracted Integer Multiply and 
Add. +/// It does the same operation as an imad op, but it can extract the +/// sources from a subset of the register (only 32, 24 or 16 bits). +/// It can also do a "load parameters" mode where the modifiers are +/// loaded from the higher bits in src2 (check Foldable impl for details). +/// Limits: src1 can never be U32 or U16Hi, +/// src2 can never be U16Hi +/// src2 signedness is tied to src1 and src0 signedness, +/// if either is signed, src2 must be signed too. +#[repr(C)] +#[derive(SrcsAsSlice, DstsAsSlice, Clone)] +pub struct OpIMadSp { + #[dst_type(GPR)] + pub dst: Dst, + + #[src_type(ALU)] + pub srcs: [Src; 3], + + pub mode: IMadSpMode, +} + +impl Foldable for OpIMadSp { + fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) { + let src0 = f.get_u32_src(self, &self.srcs[0]); + let src1 = f.get_u32_src(self, &self.srcs[1]); + let src2 = f.get_u32_src(self, &self.srcs[2]); + + let (src_type0, src_type1, src_type2) = match self.mode { + IMadSpMode::Explicit([t0, t1, t2]) => (t0, t1, t2), + IMadSpMode::FromSrc1 => { + let params = BitView::new(&src1); + + let st2 = params.get_bit_range_u64(26..28) as usize; + let st1 = params.get_bit_range_u64(28..30) as usize; + let st0 = params.get_bit_range_u64(30..32) as usize; + + use IMadSpSrcType::*; + let types0 = [U32, U24, U16Lo, U16Hi]; + let types1 = [U16Lo, U24, U16Lo, U24]; + let types2 = [U32, U24, U16Lo, U32]; + + ( + types0[st0].unsigned(), + types1[st1].unsigned(), + types2[st2].unsigned(), + ) + } + }; + + let src0 = src_type0.cast(src0); + let src1 = src_type1.cast(src1); + let src2 = src_type2.cast(src2); + + f.set_u32_dst(self, &self.dst, (src0 * src1 + src2) as u32); + } +} + +impl DisplayOp for OpIMadSp { + fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "imadsp{} {} {} {}", + self.mode, self.srcs[0], self.srcs[1], self.srcs[2] + ) + } +} +impl_display_for_op!(OpIMadSp); + +/// In SuGa ops, the address is always specified in two parts, the higher +/// part 
contains the base address without the lower 8 bits (base_addr >> 8), +/// while the lower part might contain either the missing 8 bits (U8) or +/// a full 32-bit offset that must not be shifted (U32). +/// +/// In short: +/// U8 : real_address = (addr_hi << 8) + (addr_lo & 0xFF) +/// U32: real_address = (addr_hi << 8) + addr_lo +/// The signed variants do the same but with sign extension probably +#[derive(Clone, Copy)] +pub enum SuGaOffsetMode { + U32, + S32, + U8, + S8, +} + +/// Kepler only +/// Load a pixel from an image, takes the pixel address and format as an +/// argument. Since the image coordinates are not present, the instruction +/// also needs an `out_of_bounds` predicate, when true it always loads (0, 0, 0, 1) +#[repr(C)] +#[derive(SrcsAsSlice, DstsAsSlice)] +pub struct OpSuLdGa { + pub dst: Dst, + + pub mem_type: MemType, + pub offset_mode: SuGaOffsetMode, + + /// Format for the loaded data, passed directly from the descriptor. + #[src_type(GPR)] + pub format: Src, + + /// This is not an address, but it's two registers that contain + /// [addr >> 8, addr & 0xff]. + /// This works because addr >> 8 is 32-bits (GOB-aligned) and the + /// rest 8-bits are extracted by the bit-field + /// It's useful since in block-linear mode the lower bits and the higher + /// bits are computed in different ways. + #[src_type(SSA)] + pub addr: Src, + + #[src_type(Pred)] + pub out_of_bounds: Src, +} + +impl DisplayOp for OpSuLdGa { + fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "suldga{} [{}] {} {}", + self.mem_type, self.addr, self.format, self.out_of_bounds + ) + } +} +impl_display_for_op!(OpSuLdGa); + +/// Kepler only +/// Store a pixel in an image, takes the pixel address and format as an +/// argument.
Since the image coordinates are not present, the instruction +/// also needs an `out_of_bounds` predicate, when true, stores are ingored +#[repr(C)] +#[derive(SrcsAsSlice, DstsAsSlice)] +pub struct OpSuStGa { + pub image_access: ImageAccess, + pub offset_mode: SuGaOffsetMode, + + #[src_type(GPR)] + pub format: Src, + + #[src_type(SSA)] + pub addr: Src, + + #[src_type(SSA)] + pub data: Src, + + #[src_type(Pred)] + pub out_of_bounds: Src, +} + +impl DisplayOp for OpSuStGa { + fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "sustga{} [{}] {} {} {}", + self.image_access, + self.addr, + self.format, + self.data, + self.out_of_bounds, + ) + } +} +impl_display_for_op!(OpSuStGa); + #[repr(C)] #[derive(SrcsAsSlice, DstsAsSlice)] pub struct OpLd { @@ -6676,6 +7339,12 @@ pub enum Op { SuLd(OpSuLd), SuSt(OpSuSt), SuAtom(OpSuAtom), + SuClamp(OpSuClamp), + SuBfm(OpSuBfm), + SuEau(OpSuEau), + IMadSp(OpIMadSp), + SuLdGa(OpSuLdGa), + SuStGa(OpSuStGa), Ld(OpLd), Ldc(OpLdc), LdSharedLock(OpLdSharedLock), @@ -6805,6 +7474,10 @@ impl Op { | Op::LeaX(_) | Op::Lop2(_) | Op::Lop3(_) + | Op::SuClamp(_) + | Op::SuBfm(_) + | Op::SuEau(_) + | Op::IMadSp(_) | Op::Shf(_) | Op::Shl(_) | Op::Shr(_) @@ -6834,7 +7507,11 @@ impl Op { | Op::Txq(_) => false, // Surface ops - Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => false, + Op::SuLd(_) + | Op::SuSt(_) + | Op::SuAtom(_) + | Op::SuLdGa(_) + | Op::SuStGa(_) => false, // Memory ops Op::Ld(_) @@ -7232,7 +7909,11 @@ impl Instr { Op::Atom(op) => op.mem_space != MemSpace::Local, Op::Ld(op) => op.access.space != MemSpace::Local, Op::St(op) => op.access.space != MemSpace::Local, - Op::SuAtom(_) | Op::SuLd(_) | Op::SuSt(_) => true, + Op::SuAtom(_) + | Op::SuLd(_) + | Op::SuSt(_) + | Op::SuLdGa(_) + | Op::SuStGa(_) => true, _ => false, } } @@ -7241,7 +7922,7 @@ impl Instr { match &self.op { Op::Atom(op) => matches!(op.mem_space, MemSpace::Global(_)), Op::St(op) => matches!(op.access.space, MemSpace::Global(_)), - Op::SuAtom(_) | 
Op::SuSt(_) => true, + Op::SuAtom(_) | Op::SuSt(_) | Op::SuStGa(_) => true, _ => false, } } @@ -7250,6 +7931,7 @@ impl Instr { match &self.op { Op::ASt(_) | Op::SuSt(_) + | Op::SuStGa(_) | Op::SuAtom(_) | Op::LdSharedLock(_) | Op::St(_) diff --git a/src/nouveau/compiler/nak/opt_instr_sched_common.rs b/src/nouveau/compiler/nak/opt_instr_sched_common.rs index 92802205ddc..27046a671fa 100644 --- a/src/nouveau/compiler/nak/opt_instr_sched_common.rs +++ b/src/nouveau/compiler/nak/opt_instr_sched_common.rs @@ -129,6 +129,10 @@ pub fn side_effect_type(op: &Op) -> SideEffect { | Op::LeaX(_) | Op::Lop2(_) | Op::Lop3(_) + | Op::SuClamp(_) + | Op::SuBfm(_) + | Op::SuEau(_) + | Op::IMadSp(_) | Op::Shf(_) | Op::Shl(_) | Op::Shr(_) @@ -158,7 +162,11 @@ pub fn side_effect_type(op: &Op) -> SideEffect { | Op::Txq(_) => SideEffect::Memory, // Surface ops - Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => SideEffect::Memory, + Op::SuLd(_) + | Op::SuSt(_) + | Op::SuAtom(_) + | Op::SuLdGa(_) + | Op::SuStGa(_) => SideEffect::Memory, // Memory ops Op::Ipa(_) | Op::Ldc(_) => SideEffect::None, @@ -262,7 +270,11 @@ pub fn estimate_variable_latency(sm: u8, op: &Op) -> u32 { | Op::Txq(_) => 32, // Surface ops - Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => 32, + Op::SuLd(_) + | Op::SuSt(_) + | Op::SuAtom(_) + | Op::SuLdGa(_) + | Op::SuStGa(_) => 32, // Memory ops Op::Ldc(_) => 4,