nak: Add surface address ops

Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34975>
This commit is contained in:
Lorenzo Rossi 2025-05-23 16:01:32 +02:00 committed by Marge Bot
parent 82d789d22a
commit ac3fd5768b
4 changed files with 996 additions and 6 deletions

View file

@ -2534,6 +2534,155 @@ impl<'a> ShaderFromNir<'a> {
});
self.set_dst(&intrin.def, dst);
}
nir_intrinsic_suclamp_nv => {
let coords = self.get_src(&srcs[0]);
let params = self.get_src(&srcs[1]);
let flags = intrin.flags();
let flags: nak_nir_suclamp_flags =
unsafe { std::mem::transmute_copy(&flags) };
let mode = match flags.mode() {
NAK_SUCLAMP_MODE_BLOCK_LINEAR => SuClampMode::BlockLinear,
NAK_SUCLAMP_MODE_PITCH_LINEAR => SuClampMode::PitchLinear,
NAK_SUCLAMP_MODE_STORED_DESCRIPTOR => {
SuClampMode::StoredInDescriptor
}
_ => panic!("Invalid suclamp mode"),
};
let round = match flags.round() {
NAK_SUCLAMP_ROUND_R1 => SuClampRound::R1,
NAK_SUCLAMP_ROUND_R2 => SuClampRound::R2,
NAK_SUCLAMP_ROUND_R4 => SuClampRound::R4,
NAK_SUCLAMP_ROUND_R8 => SuClampRound::R8,
NAK_SUCLAMP_ROUND_R16 => SuClampRound::R16,
_ => panic!("Invalid suclamp round"),
};
let dst = b.alloc_ssa(RegFile::GPR);
let out_of_bounds = b.alloc_ssa(RegFile::Pred);
b.push_op(OpSuClamp {
dst: dst.into(),
out_of_bounds: out_of_bounds.into(),
coords,
params,
mode,
round,
is_2d: flags.is_2d(),
is_s32: flags.is_s32(),
imm: 0,
});
let final_dst =
vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())];
self.set_ssa(&intrin.def, final_dst);
}
nir_intrinsic_subfm_nv => {
let x = self.get_src(&srcs[0]);
let y = self.get_src(&srcs[1]);
let z = self.get_src(&srcs[2]);
let is_3d = intrin.flags() != 0;
let dst = b.alloc_ssa(RegFile::GPR);
let out_of_bounds = b.alloc_ssa(RegFile::Pred);
b.push_op(OpSuBfm {
dst: dst.into(),
pdst: out_of_bounds.into(),
srcs: [x, y, z],
is_3d,
});
let final_dst =
vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())];
self.set_ssa(&intrin.def, final_dst);
}
nir_intrinsic_sueau_nv => {
let off = self.get_src(&srcs[0]);
let bit_field = self.get_src(&srcs[1]);
let addr = self.get_src(&srcs[2]);
let dst = b.alloc_ssa(RegFile::GPR);
b.push_op(OpSuEau {
dst: dst.into(),
off,
bit_field,
addr,
});
self.set_dst(&intrin.def, dst.into());
}
nir_intrinsic_imadsp_nv => {
let src0 = self.get_src(&srcs[0]);
let src1 = self.get_src(&srcs[1]);
let src2 = self.get_src(&srcs[2]);
let flags = intrin.flags();
let flags: nak_nir_imadsp_flags =
unsafe { std::mem::transmute_copy(&flags) };
let translate_src_type = |s| {
use IMadSpSrcType::*;
match s {
NAK_IMAD_TYPE_U32 => U32,
NAK_IMAD_TYPE_U24 => U24,
NAK_IMAD_TYPE_U16_LO => U16Lo,
NAK_IMAD_TYPE_U16_HI => U16Hi,
NAK_IMAD_TYPE_S32 => S32,
NAK_IMAD_TYPE_S24 => S24,
NAK_IMAD_TYPE_S16_LO => S16Lo,
NAK_IMAD_TYPE_S16_HI => S16Hi,
_ => panic!("Invalid imadsp mode"),
}
};
let mode = if flags.params_from_src1() {
IMadSpMode::FromSrc1
} else {
IMadSpMode::Explicit([
translate_src_type(flags.src0()),
translate_src_type(flags.src1()),
translate_src_type(flags.src2()),
])
};
let dst = b.alloc_ssa(RegFile::GPR);
b.push_op(OpIMadSp {
srcs: [src0, src1, src2],
dst: dst.into(),
mode,
});
self.set_dst(&intrin.def, dst.into());
}
nir_intrinsic_suldga_nv => {
let addr = self.get_src(&srcs[0]);
let format = self.get_src(&srcs[1]);
let out_of_bounds = self.get_src(&srcs[2]);
let comps = intrin.num_components;
assert!(intrin.def.bit_size() == 32);
let mem_type = self.get_image_mem_type(intrin);
let flags = intrin.flags();
let offset_mode = match flags {
NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32,
NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32,
NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8,
NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8,
_ => panic!("Invalid suldga flags"),
};
let dst = b.alloc_ssa_vec(RegFile::GPR, comps);
b.push_op(OpSuLdGa {
dst: dst.clone().into(),
addr,
format,
out_of_bounds,
mem_type,
offset_mode,
});
self.set_dst(&intrin.def, dst);
}
nir_intrinsic_bindless_image_load
| nir_intrinsic_bindless_image_load_raw_nv => {
let handle = self.get_src(&srcs[0]);
@ -2624,6 +2773,33 @@ impl<'a> ShaderFromNir<'a> {
self.set_ssa(&intrin.def, final_dst);
}
nir_intrinsic_sustga_nv => {
let addr = self.get_src(&srcs[0]);
let format = self.get_src(&srcs[1]);
let out_of_bounds = self.get_src(&srcs[2]);
let data = self.get_src(&srcs[3]);
let image_access =
ImageAccess::Formatted(ChannelMask::new(0xf));
let flags = intrin.flags();
let offset_mode = match flags {
NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32,
NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32,
NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8,
NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8,
_ => panic!("Invalid sustga flags"),
};
b.push_op(OpSuStGa {
addr,
format,
data,
out_of_bounds,
image_access,
offset_mode,
});
}
nir_intrinsic_bindless_image_store => {
let handle = self.get_src(&srcs[0]);
let dim = self.get_image_dim(intrin);

View file

@ -1180,6 +1180,126 @@ fn test_iadd64() {
}
}
#[test]
fn test_op_suclamp() {
    // suclamp only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    // We cannot test every single combination of options.
    // Use a random generator for rounding and immediate.
    let mut rng = Acorn::new();
    let modes = [
        SuClampMode::StoredInDescriptor,
        SuClampMode::PitchLinear,
        SuClampMode::BlockLinear,
    ];
    for mode in modes {
        // Exhaust all four (is_s32, is_2d) combinations per mode.
        for bits in 0..4 {
            let is_s32 = (bits & 0x1) != 0;
            let is_2d = (bits & 0x2) != 0;
            // The immediate is an i6 value, so it lies in -32..=31.
            let imm = (rng.get_u32() % 64) as i8 - 32;
            let round = match rng.get_u32() % 5 {
                0 => SuClampRound::R1,
                1 => SuClampRound::R2,
                2 => SuClampRound::R4,
                3 => SuClampRound::R8,
                _ => SuClampRound::R16,
            };
            test_foldable_op(OpSuClamp {
                dst: Dst::None,
                out_of_bounds: Dst::None,
                mode,
                round,
                is_s32,
                is_2d,
                coords: 0.into(),
                params: 0.into(),
                imm,
            });
        }
    }
}
#[test]
fn test_op_subfm() {
    // subfm only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    // Exercise both the 2D and 3D variants.
    for is_3d in [false, true] {
        test_foldable_op(OpSuBfm {
            dst: Dst::None,
            pdst: Dst::None,
            srcs: [0.into(), 0.into(), 0.into()],
            is_3d,
        });
    }
}
#[test]
fn test_op_sueau() {
    // sueau only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    // sueau has no modifiers, so a single op covers it.
    test_foldable_op(OpSuEau {
        dst: Dst::None,
        off: 0.into(),
        bit_field: 0.into(),
        addr: 0.into(),
    });
}
#[test]
fn test_op_imadsp() {
    // imadsp only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    use IMadSpSrcType::*;
    // Legal widths per source (src1 can never be U32/U16Hi,
    // src2 can never be U16Hi).
    let src0_widths = [U32, U24, U16Lo, U16Hi];
    let src1_widths = [U24, U16Lo];
    let src2_widths = [U32, U24, U16Lo];
    let mut modes = Vec::new();
    // Cartesian product of widths and sign combinations.
    for w0 in src0_widths {
        for w1 in src1_widths {
            for w2 in src2_widths {
                for sign in 0..4 {
                    let s0 = (sign & 0x1) != 0;
                    let s1 = (sign & 0x2) != 0;
                    // src2 must be signed if either input is signed.
                    modes.push(IMadSpMode::Explicit([
                        w0.with_sign(s0),
                        w1.with_sign(s1),
                        w2.with_sign(s0 || s1),
                    ]));
                }
            }
        }
    }
    modes.push(IMadSpMode::FromSrc1);
    for mode in modes {
        test_foldable_op(OpIMadSp {
            dst: Dst::None,
            srcs: [0.into(), 0.into(), 0.into()],
            mode,
        });
    }
}
#[test]
fn test_ineg64() {
let run = RunSingleton::get();

View file

@ -4,7 +4,7 @@
extern crate bitview;
extern crate nak_ir_proc;
use bitview::{BitMutView, BitView};
use bitview::{BitMutView, BitMutViewable, BitView, BitViewable, SetField};
use nak_bindings::*;
pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder};
@ -5183,6 +5183,669 @@ impl DisplayOp for OpSuAtom {
}
impl_display_for_op!(OpSuAtom);
/// Addressing layout used by suclamp when clamping surface coordinates.
#[derive(Clone, Copy)]
pub enum SuClampMode {
    /// The layout (pitch- vs block-linear) is read at runtime from
    /// bit 21 of the packed `params` source.
    StoredInDescriptor,
    PitchLinear,
    BlockLinear,
}
impl fmt::Display for SuClampMode {
    /// Prints the assembly suffix for this mode (".sd", ".pl" or ".bl").
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            SuClampMode::StoredInDescriptor => ".sd",
            SuClampMode::PitchLinear => ".pl",
            SuClampMode::BlockLinear => ".bl",
        })
    }
}
/// Rounding applied to the clamp bound of suclamp: the bound is rounded
/// down to a multiple of 1, 2, 4, 8 or 16 (see `to_mask`).
#[derive(Clone, Copy)]
pub enum SuClampRound {
    R1,
    R2,
    R4,
    R8,
    R16,
}
impl SuClampRound {
pub fn to_int(&self) -> u8 {
match self {
SuClampRound::R1 => 1,
SuClampRound::R2 => 2,
SuClampRound::R4 => 4,
SuClampRound::R8 => 8,
SuClampRound::R16 => 16,
}
}
pub fn to_mask(&self) -> u32 {
!(self.to_int() as u32 - 1)
}
}
impl fmt::Display for SuClampRound {
    /// Prints the assembly suffix, e.g. ".r4" for `R4`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let r = self.to_int();
        write!(f, ".r{r}")
    }
}
/// Kepler only
/// Surface Clamp
///
/// Can clamp coordinates of surface operations in a 0..=clamp inclusive
/// range. It also computes other information useful to compute the
/// real address of an element within an image for both block-linear and
/// pitch-linear layouts. We can also reduce this operation to a "stupid"
/// inclusive clamp by setting modifier Mode=PitchLinear and is_2d=false;
/// this will not compute any extra operations and is useful to clamp array
/// indexes.
///
/// Since the shader code does not know if an image layout is block-linear
/// or pitch-linear, this opcode must be able to do both; the operation
/// is then selected by the "clamp" bitfield, usually read from a descriptor.
/// In block-linear mode we divide the bits that will compute the higher
/// part and the lower part.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuClamp {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Set when the (offset) coordinate fell outside the 0..=bound range.
    #[dst_type(Pred)]
    pub out_of_bounds: Dst,
    /// This modifier specifies if we use pitch-linear or block-linear
    /// calculations; another option is to support both and read the actual
    /// layout from the clamp bitfield (shader code doesn't always know an
    /// image's layout).
    /// When mode=pitch_linear and is_2d=false the suclamp op enters a
    /// simpler "plain" mode where it only performs clamping and the output
    /// register doesn't contain any information bits about pitch-linear or
    /// block-linear calculations
    pub mode: SuClampMode,
    /// Strangely enough, "round" just rounds the clamp, not the source,
    /// so it does not help at all with clamping coordinates.
    /// It could be useful when clamping raw addresses of a multi-byte read.
    /// ex: if we read 4 bytes at once, and the buffer length is 16,
    /// the bounds will be 15 (they are inclusive), but if we read
    /// at address 15 we would read bytes 15..19, so we are out of range.
    /// If we clamp the bounds to R4 the effective bound becomes 12
    /// so the read will be performed from 12..16, remaining in bounds.
    pub round: SuClampRound,
    /// Treat the coordinate as signed 32-bit (allows negative inputs).
    pub is_s32: bool,
    pub is_2d: bool,
    #[src_type(GPR)]
    pub coords: Src,
    /// Packed parameter containing both bounds (inclusive)
    /// and other information (explained in more details in Foldable):
    /// 0..20: bound (inclusive)
    /// 21: pitch_linear (used if mode == StoredInDescriptor)
    /// 22..26: coord shl
    /// 26..29: coord shr
    /// 29..32: n. of tiles
    #[src_type(ALU)]
    pub params: Src,
    /// Added to the coords, it's only an i6
    pub imm: i8,
}
impl Foldable for OpSuClamp {
    /// Constant-folds suclamp: clamps the (coord + imm) value to the bound
    /// packed in `params` and packs the layout-dependent address-helper
    /// bits into the 32-bit result exactly as the hardware does.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let src = f.get_u32_src(self, &self.coords);
        let params = f.get_u32_src(self, &self.params);
        let imm = self.imm; // i6
        // Widen to i64 so adding the immediate cannot overflow and
        // negative coordinates can be detected uniformly.
        let src = if self.is_s32 {
            (src as i32) as i64
        } else {
            src as i64
        };
        let src = src + (imm as i64);
        let params_bv = BitView::new(&params);
        let pitch_linear = match self.mode {
            // Layout bit lives in the descriptor's packed params.
            SuClampMode::StoredInDescriptor => params_bv.get_bit(21),
            SuClampMode::PitchLinear => true,
            SuClampMode::BlockLinear => false,
        };
        let bounds = if pitch_linear && !self.is_2d {
            // "Plain" clamp mode: all 32 bits of params are the bound.
            params
        } else {
            params_bv.get_bit_range_u64(0..20) as u32
        };
        // Round the bound down to the requested granularity.
        let bounds = bounds & self.round.to_mask();
        let (is_oob, clamped) = if src < 0 {
            (true, 0)
        } else if src > (bounds as i64) {
            (true, bounds)
        } else {
            (false, src as u32)
        };
        let mut out = 0u32;
        let mut bv = BitMutView::new(&mut out);
        if pitch_linear {
            if !self.is_2d {
                // simple clamp mode, NO BITFIELD
                bv.set_field(0..32, clamped);
            } else {
                // Real pitch_linear mode
                bv.set_field(0..20, clamped & 0xfffff);
                // Pass through el_size_log2
                bv.set_field(27..30, params_bv.get_bit_range_u64(26..29));
                bv.set_bit(30, true); // pitch_linear=true
                bv.set_bit(31, is_oob);
            }
        } else {
            // Block linear
            // Number of bits to discard for GoB coordinates
            let shr_a = params_bv.get_bit_range_u64(22..26) as u8;
            // Block coords
            bv.set_field(0..16, (clamped >> shr_a) & 0xffff);
            // Shift applied to coords, always zero except for x.
            // (for coord x=1 and format R32, we want to access byte 4)
            // e.g. R8 -> 0, R32 -> 2, 128 -> 4
            let el_size_log2 = params_bv.get_bit_range_u64(26..29) as u8;
            // Coord inside GoB (element space)
            bv.set_field(16..24, (clamped << el_size_log2) & 0xff);
            // Useful later to compute gob-space coords.
            let n_tiles = params_bv.get_bit_range_u64(29..32) as u8;
            bv.set_field(27..30, n_tiles);
            bv.set_bit(30, false); // pitch_linear=false
            bv.set_bit(31, is_oob);
        }
        f.set_u32_dst(self, &self.dst, out);
        f.set_pred_dst(self, &self.out_of_bounds, is_oob);
    }
}
impl DisplayOp for OpSuClamp {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The mode suffix is always printed; the remaining modifiers
        // only when they deviate from the default.
        write!(f, "suclamp{}", self.mode)?;
        match self.round {
            SuClampRound::R1 => {}
            round => write!(f, "{}", round)?,
        }
        if !self.is_s32 {
            write!(f, ".u32")?;
        }
        if !self.is_2d {
            write!(f, ".1d")?;
        }
        write!(f, " {} {} {:x}", self.coords, self.params, self.imm)
    }
}
impl_display_for_op!(OpSuClamp);
/// Kepler only
/// BitField Merge
///
/// The resulting bit-field is composed of a high-part 8..32 that is merged
/// with the address by sueau, and a lower-part 0..8 that is provided
/// directly to suldga/sustga and defines the lower offset of the global
/// array.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuBfm {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Out-of-bounds predicate: set when any used input coordinate has
    /// its OOB bit (bit 31) set.
    #[dst_type(Pred)]
    pub pdst: Dst,
    /// x, y, z
    #[src_type(ALU)]
    pub srcs: [Src; 3],
    /// When is_3d=false the third source is ignored, but still used in
    /// pitch-linear computation.
    pub is_3d: bool,
}
impl Foldable for OpSuBfm {
    /// Constant-folds subfm: merges the per-axis suclamp results into the
    /// single bit-field consumed by sueau and suldga/sustga. The exact bit
    /// positions replicate the hardware, so statement order matters.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let x_raw = f.get_u32_src(self, &self.srcs[0]);
        let y_raw = f.get_u32_src(self, &self.srcs[1]);
        let z_raw = f.get_u32_src(self, &self.srcs[2]);
        let x = BitView::new(&x_raw);
        let y = BitView::new(&y_raw);
        let z = BitView::new(&z_raw);
        let mut o_raw = 0u32;
        let mut o = BitMutView::new(&mut o_raw);
        // Bit 30 of a suclamp result flags pitch-linear 2D mode.
        let is_pitch_linear_2d = x.get_bit(30) || y.get_bit(30);
        if !is_pitch_linear_2d {
            // Copy coordinates inside of GoB space.
            // They are 6 bits from x and 3 from y (GoB is 64x8 bytes).
            // Bits from 0..8 are ignored by sueau and are used directly
            // by suldga/sustga.
            // Bit 9 will become the first bit of the higher part in
            // sueau.
            o.set_bit_range_u64(0..4, x.get_bit_range_u64(16..20));
            // Address calculation inside of GoB should virtually be
            // y * 64 + x * element_size (each row is linear).
            // So why are those bits swizzled like so?
            // I have no idea, but these are correct even for atomics
            // that accept real addresses.
            o.set_bit(4, y.get_bit(16));
            o.set_bit(5, y.get_bit(17));
            o.set_bit(6, x.get_bit(20));
            o.set_bit(7, y.get_bit(18));
            o.set_bit(8, x.get_bit(21));
            // 9..11: 0
            // -------------- Tiles --------------
            // Number of tiles log2
            let ntx = x.get_bit_range_u64(27..30) & 0x1;
            let nty = y.get_bit_range_u64(27..30);
            let ntz = z.get_bit_range_u64(27..30);
            let ntz = ntz * (self.is_3d as u64); // z is ignored if is_3d=false
            // Computes how many bits to dedicate to GoB coords inside
            // a block
            o.set_field(12..16, ntx + nty + ntz);
            // Coords in gob_space.
            // Remove 6 bits from x and 3 bits from y, those are used
            // as element coords in GoB space.
            let a = x.get_bit_range_u64(22..24); // 1100_0000
            let b = y.get_bit_range_u64(19..24); // 1111_1000
            let c = z.get_bit_range_u64(16..24); // 1111_1111
            // nt* indicates how many bits to consider (max 5)
            let a = a & ((1 << ntx) - 1);
            let b = b & ((1 << nty.min(5)) - 1);
            let c = c & ((1 << ntz.min(5)) - 1);
            // Compute gob offset
            // We can just or together at certain offsets because
            // Tiles are always powers of two in each direction.
            // z || y || x (LSB)
            let res = c;
            let res = (res << nty) | b;
            let res = (res << ntx) | a;
            let mask = match ntx {
                0 => 0x3ff,
                _ => 0x7ff,
            };
            // gob coords will be put before the block coords in
            // sueau.
            o.set_field(16..27, res & mask);
        } else {
            // Pitch-linear 2D: only the element offset and element size
            // are forwarded.
            let d = z.get_bit_range_u64(0..8);
            let el_size_log2 = x.get_bit_range_u64(27..30);
            o.set_field(0..8, (d << el_size_log2) & 0xff);
            // 9..11: 0
            o.set_field(12..15, el_size_log2);
        }
        o.set_bit(11, is_pitch_linear_2d);
        // OOB if any used coordinate was clamped (z only counts in 3D).
        let is_oob =
            x.get_bit(31) || y.get_bit(31) || (z.get_bit(31) && self.is_3d);
        f.set_u32_dst(self, &self.dst, o_raw);
        f.set_pred_dst(self, &self.pdst, is_oob);
    }
}
impl DisplayOp for OpSuBfm {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let dim = if self.is_3d { ".3d" } else { "" };
        let [x, y, z] = &self.srcs;
        write!(f, "subfm{} {} {} {}", dim, x, y, z)
    }
}
impl_display_for_op!(OpSuBfm);
/// Kepler only
/// Used to compute the higher 32 bits of image address using
/// the merged bitfield and the block coordinates (offset).
/// It can switch to a pitch_linear mode (bit 11 of bit-field).
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuEau {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// offset is computed from the block coordinates.
    /// it's ok to add it directly to the address since they are both
    /// "aligned" to 64 (the first 8 bits are removed from both)
    #[src_type(GPR)]
    pub off: Src,
    /// Merged bit-field, as produced by subfm:
    /// 8.. 9: offset, last bit
    /// 11..12: pitch_linear: when enabled the bf-offset is ignored and
    /// the off_shl is subtracted by 8
    /// 12..16: off_shl, shifts left the offset by off_shl + 1
    /// 16..27: 11-bit offset, when joined with the 1-bit offset completes the
    /// 12-bit offset ORed to the src offset after shifting
    /// (unless pitch_linear)
    #[src_type(ALU)]
    pub bit_field: Src,
    #[src_type(GPR)]
    pub addr: Src,
}
impl Foldable for OpSuEau {
    /// Constant-folds sueau: combines the block offset, the merged
    /// bit-field and the base address into the high address word.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let off_raw = f.get_u32_src(self, &self.off);
        let bf_raw = f.get_u32_src(self, &self.bit_field);
        let addr = f.get_u32_src(self, &self.addr);
        let bf = BitView::new(&bf_raw);
        // Unpack the bit-field (layout documented on the struct).
        let off1 = bf.get_bit_range_u64(8..9) as u32;
        let is_pitch_linear = bf.get_bit(11);
        let off_shift = bf.get_bit_range_u64(12..16) as u32;
        let offs = bf.get_bit_range_u64(16..27) as u32;
        let res = if !is_pitch_linear {
            // Block linear
            // off_raw are the block coordinates
            // to those we add gob coordinates from the merged bitfield
            // and the MSB of in-gob coordinates.
            let omul = off_shift + 1;
            let real_off = (off_raw << omul) | (offs << 1) | off1;
            addr.wrapping_add(real_off & 0x7ff_ffff)
        } else {
            // Add the high part of the coordinates to addr
            // off << (omul - 8)
            // but for negative values do a shr instead.
            // In fact, off_shift will always be < 8 because pitch_linear
            // subfm only assigns bits 12..15, so this is always a shr
            let shl_amount = off_shift as i32 - 8;
            let off = if shl_amount < 0 {
                off_raw >> (-shl_amount as u32)
            } else {
                off_raw << (shl_amount as u32)
            };
            addr.wrapping_add(off & 0xff_ffff)
        };
        f.set_u32_dst(self, &self.dst, res);
    }
}
impl DisplayOp for OpSuEau {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "sueau {} {} {}", self.off, self.bit_field, self.addr)
    }
}
impl_display_for_op!(OpSuEau);
/// Source-extraction type for imadsp operands: selects how many bits of
/// the 32-bit register are read (32/24/16, low or high half) and whether
/// the extracted field is sign-extended.
#[derive(Copy, Clone, Debug)]
pub enum IMadSpSrcType {
    U32,
    U24,
    U16Hi,
    U16Lo,
    S32,
    S24,
    S16Hi,
    S16Lo,
}
impl IMadSpSrcType {
    /// Returns the unsigned counterpart of this type (identity for
    /// already-unsigned types).
    pub fn unsigned(self) -> IMadSpSrcType {
        use IMadSpSrcType::*;
        match self {
            S32 => U32,
            S24 => U24,
            S16Hi => U16Hi,
            S16Lo => U16Lo,
            other => other,
        }
    }
    /// Returns the signed or unsigned counterpart depending on `sign`.
    #[allow(dead_code)] // Used in hw_tests
    pub fn with_sign(self, sign: bool) -> Self {
        use IMadSpSrcType::*;
        if sign {
            match self {
                U32 => S32,
                U24 => S24,
                U16Hi => S16Hi,
                U16Lo => S16Lo,
                other => other,
            }
        } else {
            self.unsigned()
        }
    }
    /// Whether this type sign-extends its extracted field.
    pub fn sign(self) -> bool {
        use IMadSpSrcType::*;
        match self {
            S32 | S24 | S16Hi | S16Lo => true,
            U32 | U24 | U16Hi | U16Lo => false,
        }
    }
    /// Extracts the selected field from a 32-bit register value and
    /// widens it to i64, sign-extending for the signed variants.
    fn cast(&self, v: u32) -> i64 {
        use IMadSpSrcType::*;
        match self {
            U32 => i64::from(v),
            U24 => i64::from(v & 0x00ff_ffff),
            U16Lo => i64::from(v as u16),
            U16Hi => i64::from(v >> 16),
            S32 => i64::from(v as i32),
            // Shift up then arithmetic-shift down to sign extend 24 bits.
            S24 => i64::from(((v as i32) << 8) >> 8),
            S16Lo => i64::from(v as i16),
            S16Hi => i64::from((v >> 16) as i16),
        }
    }
}
impl fmt::Display for IMadSpSrcType {
    /// Prints the assembly suffix, e.g. ".s24" or ".u16h1".
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let sign = if self.sign() { "s" } else { "u" };
        let width = match self.unsigned() {
            IMadSpSrcType::U32 => "32",
            IMadSpSrcType::U24 => "24",
            IMadSpSrcType::U16Lo => "16h0",
            IMadSpSrcType::U16Hi => "16h1",
            // unsigned() never returns a signed variant
            _ => unreachable!(),
        };
        write!(f, ".{}{}", sign, width)
    }
}
/// Operand-type selection for imadsp.
#[derive(Clone, Copy, Debug)]
pub enum IMadSpMode {
    /// Types for src0, src1, src2 encoded explicitly in the instruction.
    Explicit([IMadSpSrcType; 3]),
    // Parameters are loaded from src1 bits 26..32
    FromSrc1,
}
impl fmt::Display for IMadSpMode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Explicit types print back-to-back, e.g. ".u32.u24.u24".
            IMadSpMode::Explicit(types) => {
                for t in types {
                    write!(f, "{t}")?;
                }
                Ok(())
            }
            IMadSpMode::FromSrc1 => f.write_str(".sd"),
        }
    }
}
/// Kepler only
/// Extracted Integer Multiply and Add.
/// It does the same operation as an imad op, but it can extract the
/// sources from a subset of the register (only 32, 24 or 16 bits).
/// It can also do a "load parameters" mode where the modifiers are
/// loaded from the higher bits in src1 (check Foldable impl for details).
/// Limits: src1 can never be U32 or U16Hi,
/// src2 can never be U16Hi
/// src2 signedness is tied to src1 and src0 signedness,
/// if either is signed, src2 must be signed too.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpIMadSp {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Computes srcs[0] * srcs[1] + srcs[2].
    #[src_type(ALU)]
    pub srcs: [Src; 3],
    /// How each source field is extracted (explicit or read from src1).
    pub mode: IMadSpMode,
}
impl Foldable for OpIMadSp {
    /// Constant-folds imadsp: extracts each source per its type and
    /// computes src0 * src1 + src2, truncated to 32 bits.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let src0 = f.get_u32_src(self, &self.srcs[0]);
        let src1 = f.get_u32_src(self, &self.srcs[1]);
        let src2 = f.get_u32_src(self, &self.srcs[2]);
        let (src_type0, src_type1, src_type2) = match self.mode {
            IMadSpMode::Explicit([t0, t1, t2]) => (t0, t1, t2),
            IMadSpMode::FromSrc1 => {
                // Type selectors live in the top 6 bits of src1.
                let params = BitView::new(&src1);
                let st2 = params.get_bit_range_u64(26..28) as usize;
                let st1 = params.get_bit_range_u64(28..30) as usize;
                let st0 = params.get_bit_range_u64(30..32) as usize;
                use IMadSpSrcType::*;
                let types0 = [U32, U24, U16Lo, U16Hi];
                let types1 = [U16Lo, U24, U16Lo, U24];
                let types2 = [U32, U24, U16Lo, U32];
                (
                    types0[st0].unsigned(),
                    types1[st1].unsigned(),
                    types2[st2].unsigned(),
                )
            }
        };
        let src0 = src_type0.cast(src0);
        let src1 = src_type1.cast(src1);
        let src2 = src_type2.cast(src2);
        // src1 is never a full 32-bit type (see struct docs), so the
        // i64 product cannot overflow before truncation to u32.
        f.set_u32_dst(self, &self.dst, (src0 * src1 + src2) as u32);
    }
}
impl DisplayOp for OpIMadSp {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let [a, b, c] = &self.srcs;
        write!(f, "imadsp{} {} {} {}", self.mode, a, b, c)
    }
}
impl_display_for_op!(OpIMadSp);
/// In SuGa ops, the address is always specified in two parts, the higher
/// part contains the base address without the lower 8 bits (base_addr >> 8),
/// while the lower part might contain either the missing 8 bits (U8) or
/// a full 32-bit offset that must not be shifted (U32).
///
/// In short:
/// U8 : real_address = (addr_hi << 8) + (addr_lo & 0xFF)
/// U32: real_address = (addr_hi << 8) + addr_lo
/// The signed variants presumably do the same but with sign extension —
/// not yet verified on hardware.
#[derive(Clone, Copy)]
pub enum SuGaOffsetMode {
    U32,
    S32,
    U8,
    S8,
}
/// Kepler only
/// Load a pixel from an image, takes the pixel address and format as an
/// argument. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true it always loads
/// (0, 0, 0, 1)
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuLdGa {
    // NOTE(review): unlike the other surface ops here, `dst` carries no
    // #[dst_type(...)] attribute — confirm the derive's default matches GPR.
    pub dst: Dst,
    /// Memory type of the loaded data.
    pub mem_type: MemType,
    /// How the low address part is combined with the high part.
    pub offset_mode: SuGaOffsetMode,
    /// Format for the loaded data, passed directly from the descriptor.
    #[src_type(GPR)]
    pub format: Src,
    /// This is not an address, but it's two registers that contain
    /// [addr >> 8, addr & 0xff].
    /// This works because addr >> 8 is 32-bits (GOB-aligned) and the
    /// remaining 8 bits are extracted by the bit-field.
    /// It's useful since in block-linear mode the lower bits and the higher
    /// bits are computed in different ways.
    #[src_type(SSA)]
    pub addr: Src,
    #[src_type(Pred)]
    pub out_of_bounds: Src,
}
impl DisplayOp for OpSuLdGa {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "suldga{} [{}]", self.mem_type, self.addr)?;
        write!(f, " {} {}", self.format, self.out_of_bounds)
    }
}
impl_display_for_op!(OpSuLdGa);
/// Kepler only
/// Store a pixel in an image, takes the pixel address and format as an
/// argument. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true, stores are ignored
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuStGa {
    pub image_access: ImageAccess,
    /// How the low address part is combined with the high part.
    pub offset_mode: SuGaOffsetMode,
    /// Format for the stored data, passed directly from the descriptor.
    #[src_type(GPR)]
    pub format: Src,
    /// Split address: two registers holding [addr >> 8, addr & 0xff],
    /// same scheme as suldga's address source.
    #[src_type(SSA)]
    pub addr: Src,
    #[src_type(SSA)]
    pub data: Src,
    #[src_type(Pred)]
    pub out_of_bounds: Src,
}
impl DisplayOp for OpSuStGa {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "sustga{} [{}]", self.image_access, self.addr)?;
        write!(f, " {} {} {}", self.format, self.data, self.out_of_bounds)
    }
}
impl_display_for_op!(OpSuStGa);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpLd {
@ -6676,6 +7339,12 @@ pub enum Op {
SuLd(OpSuLd),
SuSt(OpSuSt),
SuAtom(OpSuAtom),
SuClamp(OpSuClamp),
SuBfm(OpSuBfm),
SuEau(OpSuEau),
IMadSp(OpIMadSp),
SuLdGa(OpSuLdGa),
SuStGa(OpSuStGa),
Ld(OpLd),
Ldc(OpLdc),
LdSharedLock(OpLdSharedLock),
@ -6805,6 +7474,10 @@ impl Op {
| Op::LeaX(_)
| Op::Lop2(_)
| Op::Lop3(_)
| Op::SuClamp(_)
| Op::SuBfm(_)
| Op::SuEau(_)
| Op::IMadSp(_)
| Op::Shf(_)
| Op::Shl(_)
| Op::Shr(_)
@ -6834,7 +7507,11 @@ impl Op {
| Op::Txq(_) => false,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => false,
Op::SuLd(_)
| Op::SuSt(_)
| Op::SuAtom(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => false,
// Memory ops
Op::Ld(_)
@ -7232,7 +7909,11 @@ impl Instr {
Op::Atom(op) => op.mem_space != MemSpace::Local,
Op::Ld(op) => op.access.space != MemSpace::Local,
Op::St(op) => op.access.space != MemSpace::Local,
Op::SuAtom(_) | Op::SuLd(_) | Op::SuSt(_) => true,
Op::SuAtom(_)
| Op::SuLd(_)
| Op::SuSt(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => true,
_ => false,
}
}
@ -7241,7 +7922,7 @@ impl Instr {
match &self.op {
Op::Atom(op) => matches!(op.mem_space, MemSpace::Global(_)),
Op::St(op) => matches!(op.access.space, MemSpace::Global(_)),
Op::SuAtom(_) | Op::SuSt(_) => true,
Op::SuAtom(_) | Op::SuSt(_) | Op::SuStGa(_) => true,
_ => false,
}
}
@ -7250,6 +7931,7 @@ impl Instr {
match &self.op {
Op::ASt(_)
| Op::SuSt(_)
| Op::SuStGa(_)
| Op::SuAtom(_)
| Op::LdSharedLock(_)
| Op::St(_)

View file

@ -129,6 +129,10 @@ pub fn side_effect_type(op: &Op) -> SideEffect {
| Op::LeaX(_)
| Op::Lop2(_)
| Op::Lop3(_)
| Op::SuClamp(_)
| Op::SuBfm(_)
| Op::SuEau(_)
| Op::IMadSp(_)
| Op::Shf(_)
| Op::Shl(_)
| Op::Shr(_)
@ -158,7 +162,11 @@ pub fn side_effect_type(op: &Op) -> SideEffect {
| Op::Txq(_) => SideEffect::Memory,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => SideEffect::Memory,
Op::SuLd(_)
| Op::SuSt(_)
| Op::SuAtom(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => SideEffect::Memory,
// Memory ops
Op::Ipa(_) | Op::Ldc(_) => SideEffect::None,
@ -262,7 +270,11 @@ pub fn estimate_variable_latency(sm: u8, op: &Op) -> u32 {
| Op::Txq(_) => 32,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => 32,
Op::SuLd(_)
| Op::SuSt(_)
| Op::SuAtom(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => 32,
// Memory ops
Op::Ldc(_) => 4,