diff --git a/src/nouveau/nil/copy.rs b/src/nouveau/nil/copy.rs
new file mode 100644
index 00000000000..804d35ddfce
--- /dev/null
+++ b/src/nouveau/nil/copy.rs
@@ -0,0 +1,594 @@
+// Copyright © 2024 Valve Corp. and Collabora, Ltd.
+// SPDX-License-Identifier: MIT
+
+use crate::extent::{units, Extent4D, Offset4D};
+use crate::tiling::Tiling;
+
+use std::ffi::c_void;
+use std::ops::Range;
+
+// This file is dedicated to the internal tiling layout, mainly in the context
+// of CPU-based tiled memcpy implementations (and helpers) for
+// VK_EXT_host_image_copy.
+//
+// Work here is based on isl_tiled_memcpy, fd6_tiled_memcpy, earlier work by
+// Rebecca Mckeever, and
+// https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/
+//
+// On NVIDIA, the tiling system is a two-tier one. Images are first tiled into
+// a grid of tiles (called "Blocks"), arranged in rows of one or more columns:
+//
+// +----------+----------+----------+----------+
+// | Block 0  | Block 1  | Block 2  | Block 3  |
+// +----------+----------+----------+----------+
+// | Block 4  | Block 5  | Block 6  | Block 7  |
+// +----------+----------+----------+----------+
+// | Block 8  | Block 9  | Block 10 | Block 11 |
+// +----------+----------+----------+----------+
+//
+// The blocks themselves are ordered linearly as can be seen above, which is
+// where the "Block Linear" name for NVIDIA's tiling scheme comes from.
+//
+// For 3D images, each block continues in the Z direction such that tiles
+// contain multiple Z slices. If the image depth is greater than the
+// block depth, there will be more than one layer of blocks, where a layer is
+// made up of 1 or more Z slices. For example, if the above tile pattern was
+// the first layer of a multilayer arrangement, the second layer would be:
+//
+// +----------+----------+----------+----------+
+// | Block 12 | Block 13 | Block 14 | Block 15 |
+// +----------+----------+----------+----------+
+// | Block 16 | Block 17 | Block 18 | Block 19 |
+// +----------+----------+----------+----------+
+// | Block 20 | Block 21 | Block 22 | Block 23 |
+// +----------+----------+----------+----------+
+//
+// The number of rows, columns, and layers of tiles can thus be deduced to be:
+//
+//    rows    >= ceiling(image_height / block_height)
+//    columns >= ceiling(image_width  / block_width)
+//    layers  >= ceiling(image_depth  / block_depth)
+//
+// where block_width is a constant 64B (except for sparse) and block_height
+// can be either 8 or 16 GOBs tall (more on GOBs below). For us, block_depth
+// is one for now.
+//
+// The >= is in case the blocks around the edges are partial.
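+//
+// As a quick illustration of those formulas (this helper is hypothetical and
+// is not used by the code in this file):
+//
+//    fn block_grid(
+//        image: (u32, u32, u32),
+//        block: (u32, u32, u32),
+//    ) -> (u32, u32, u32) {
+//        // (columns, rows, layers) of blocks, rounding up for partial blocks
+//        (
+//            image.0.div_ceil(block.0),
+//            image.1.div_ceil(block.1),
+//            image.2.div_ceil(block.2),
+//        )
+//    }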
+//
+// Now comes the second tier. Each block is composed of GOBs (Groups of Bytes)
+// arranged in ascending order in a single column:
+//
+// +---------------------------+
+// |           GOB 0           |
+// +---------------------------+
+// |           GOB 1           |
+// +---------------------------+
+// |           GOB 2           |
+// +---------------------------+
+// |           GOB 3           |
+// +---------------------------+
+//
+// The number of GOBs in a full block is
+//
+//    block_height * block_depth
+//
+// An Ampere GOB is 512 bytes, arranged in a 64Bx8 layout, and is split into
+// Sectors. Each Sector is 32 bytes, arranged in a 16Bx2 layout (i.e., two 16B
+// lines on top of each other). The sectors are then arranged into two columns
+// that are each 2 sectors wide by 4 sectors tall, leading to a 4x4 grid of
+// sectors (sector numbers below are relative to their column):
+//
+// +----------+----------+----------+----------+
+// | Sector 0 | Sector 1 | Sector 0 | Sector 1 |
+// +----------+----------+----------+----------+
+// | Sector 2 | Sector 3 | Sector 2 | Sector 3 |
+// +----------+----------+----------+----------+
+// | Sector 4 | Sector 5 | Sector 4 | Sector 5 |
+// +----------+----------+----------+----------+
+// | Sector 6 | Sector 7 | Sector 6 | Sector 7 |
+// +----------+----------+----------+----------+
+//
+// From the given pixel address equations in the Orin manual, we arrived at
+// the following bit interleave pattern for the pixel address within a GOB:
+//
+//  b8 b7 b6 b5 b4 b3 b2 b1 b0
+//  --------------------------
+//  x5 y2 y1 x4 y0 x3 x2 x1 x0
+//
+// Which would look something like this:
+//
+//    fn get_pixel_offset(
+//        x: usize,
+//        y: usize,
+//    ) -> usize {
+//        (x & 15) |
+//        (y & 1) << 4 |
+//        (x & 16) << 1 |
+//        (y & 6) << 5 |
+//        (x & 32) << 3
+//    }
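+//
+// To show how the two tiers compose (a sketch only; this helper is
+// hypothetical, not used below, and assumes block_depth == 1 and blocks that
+// are 1 GOB wide), the byte offset of (x, y) in an image that is width_bl
+// blocks wide could be written as:
+//
+//    fn get_tiled_offset(
+//        x: usize,
+//        y: usize,
+//        width_bl: usize,          // image width in blocks
+//        block_height_gobs: usize, // 8 or 16
+//    ) -> usize {
+//        let (gob_w, gob_h) = (64, 8);
+//        let block_h = gob_h * block_height_gobs;
+//        let block_size = gob_w * block_h;
+//        // Blocks are laid out linearly, GOBs stack in a single column
+//        let block_idx = (y / block_h) * width_bl + (x / gob_w);
+//        let gob_idx = (y % block_h) / gob_h;
+//        block_idx * block_size
+//            + gob_idx * (gob_w * gob_h)
+//            + get_pixel_offset(x % gob_w, y % gob_h)
+//    }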
+
+// The way our implementation works is by splitting an image into tiles; each
+// tile is then broken into its GOBs, and finally each GOB into sectors, where
+// each sector is copied into its position.
+//
+// For code sharing and cleanliness, we write everything to be very generic,
+// so that it can be shared between the Linear <-> Tiled and Tiled <-> Linear
+// paths, and (ab)use Rust's traits to specialize the last level
+// (copy_gob/copy_whole_gob) for a particular direction.
+//
+// The copy_x and copy_whole_x distinction is made because, if we can
+// guarantee that tiles/GOBs are whole and aligned, we can skip all bounds
+// checking and copy things in fast and tight loops.
+
+/// Copies a GOB
+///
+/// This trait should be implemented twice for each GOB type, once for
+/// tiled-to-linear and once for linear-to-tiled. This allows us to implement
+/// the rest of the tiled copies in a generic way.
+trait CopyGOB {
+    const GOB_EXTENT_B: Extent4D<units::Bytes>;
+    const X_DIVISOR: u32;
+
+    unsafe fn copy_gob(
+        tiled: usize,
+        linear: LinearPointer,
+        start: Offset4D<units::Bytes>,
+        end: Offset4D<units::Bytes>,
+    );
+
+    // No bounding box for this one
+    unsafe fn copy_whole_gob(tiled: usize, linear: LinearPointer) {
+        Self::copy_gob(
+            tiled,
+            linear,
+            Offset4D::new(0, 0, 0, 0),
+            Offset4D::new(0, 0, 0, 0) + Self::GOB_EXTENT_B,
+        );
+    }
+}
+
+/// Copies at most 16B of data to/from linear
+trait Copy16B {
+    const X_DIVISOR: u32;
+
+    unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize);
+
+    unsafe fn copy_16b(tiled: *mut [u8; 16], linear: *mut [u8; 16]) {
+        Self::copy(tiled as *mut _, linear as *mut _, 16);
+    }
+}
+
+struct CopyGOBTuring2D<C: Copy16B> {
+    phantom: std::marker::PhantomData<C>,
+}
+
+impl<C: Copy16B> CopyGOBTuring2D<C> {
+    // Calls f(gob_offset_B, x_B, y) for each 16B line in the GOB, following
+    // the sector layout described at the top of this file.
+    fn for_each_16b(mut f: impl FnMut(u32, u32, u32)) {
+        for i in 0..2 {
+            f(i * 0x100 + 0x00, i * 32 + 0, 0);
+            f(i * 0x100 + 0x10, i * 32 + 0, 1);
+            f(i * 0x100 + 0x20, i * 32 + 16, 0);
+            f(i * 0x100 + 0x30, i * 32 + 16, 1);
+
+            f(i * 0x100 + 0x40, i * 32 + 0, 2);
+            f(i * 0x100 + 0x50, i * 32 + 0, 3);
+            f(i * 0x100 + 0x60, i * 32 + 16, 2);
+            f(i * 0x100 + 0x70, i * 32 + 16, 3);
+
+            f(i * 0x100 + 0x80, i * 32 + 0, 4);
+            f(i * 0x100 + 0x90, i * 32 + 0, 5);
+            f(i * 0x100 + 0xa0, i * 32 + 16, 4);
+            f(i * 0x100 + 0xb0, i * 32 + 16, 5);
+
+            f(i * 0x100 + 0xc0, i * 32 + 0, 6);
+            f(i * 0x100 + 0xd0, i * 32 + 0, 7);
+            f(i * 0x100 + 0xe0, i * 32 + 16, 6);
+            f(i * 0x100 + 0xf0, i * 32 + 16, 7);
+        }
+    }
+}
+
+impl<C: Copy16B> CopyGOB for CopyGOBTuring2D<C> {
+    const GOB_EXTENT_B: Extent4D<units::Bytes> = Extent4D::new(64, 8, 1, 1);
+    const X_DIVISOR: u32 = C::X_DIVISOR;
+
+    unsafe fn copy_gob(
+        tiled: usize,
+        linear: LinearPointer,
+        start: Offset4D<units::Bytes>,
+        end: Offset4D<units::Bytes>,
+    ) {
+        Self::for_each_16b(|offset, x, y| {
+            if y >= start.y && y < end.y {
+                let tiled = tiled + (offset as usize);
+                let linear = linear.at(Offset4D::new(x, y, 0, 0));
+                if x >= start.x && x + 16 <= end.x {
+                    // The whole 16B line is inside the bounding box
+                    C::copy_16b(tiled as *mut _, linear as *mut _);
+                } else if x + 16 >= start.x && x < end.x {
+                    // The line is only partially inside the bounding box
+                    let start = (std::cmp::max(x, start.x) - x) as usize;
+                    let end = std::cmp::min(end.x - x, 16) as usize;
+                    C::copy(
+                        (tiled + start) as *mut _,
+                        (linear + start) as *mut _,
+                        end - start,
+                    );
+                }
+            }
+        });
+    }
+
+    unsafe fn copy_whole_gob(tiled: usize, linear: LinearPointer) {
+        Self::for_each_16b(|offset, x, y| {
+            let tiled = tiled + (offset as usize);
+            let linear = linear.at(Offset4D::new(x, y, 0, 0));
+            C::copy_16b(tiled as *mut _, linear as *mut _);
+        });
+    }
+}
+
+fn aligned_range(start: u32, end: u32, align: u32) -> Range<u32> {
+    debug_assert!(align.is_power_of_two());
+    let align_1 = align - 1;
+    (start & !align_1)..((end + align_1) & !align_1)
+}
+
+fn chunk_range(
+    whole: Range<u32>,
+    chunk_start: u32,
+    chunk_len: u32,
+) -> Range<u32> {
+    debug_assert!(chunk_start < whole.end);
+    let start = if chunk_start < whole.start {
+        whole.start - chunk_start
+    } else {
+        0
+    };
+    let end = std::cmp::min(whole.end - chunk_start, chunk_len);
+    start..end
+}
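+
+// For example (values purely illustrative), copying the byte range 10..100
+// with 64B-wide chunks visits the chunks starting at 0 and 64:
+//
+//    aligned_range(10, 100, 64)   == 0..128
+//    chunk_range(10..100, 0, 64)  == 10..64  // leading chunk is partial
+//    chunk_range(10..100, 64, 64) == 0..36   // trailing chunk is partial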
+
+fn for_each_extent4d(
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+    chunk: Extent4D<units::Bytes>,
+    mut f: impl FnMut(
+        Offset4D<units::Bytes>,
+        Offset4D<units::Bytes>,
+        Offset4D<units::Bytes>,
+    ),
+) {
+    debug_assert!(chunk.width.is_power_of_two());
+    debug_assert!(chunk.height.is_power_of_two());
+    debug_assert!(chunk.depth.is_power_of_two());
+    debug_assert!(chunk.array_len == 1);
+
+    debug_assert!(start.a == 0);
+    debug_assert!(end.a == 1);
+
+    let x_range = aligned_range(start.x, end.x, chunk.width);
+    let y_range = aligned_range(start.y, end.y, chunk.height);
+    let z_range = aligned_range(start.z, end.z, chunk.depth);
+
+    for z in z_range.step_by(chunk.depth as usize) {
+        let chunk_z = chunk_range(start.z..end.z, z, chunk.depth);
+        for y in y_range.clone().step_by(chunk.height as usize) {
+            let chunk_y = chunk_range(start.y..end.y, y, chunk.height);
+            for x in x_range.clone().step_by(chunk.width as usize) {
+                let chunk_x = chunk_range(start.x..end.x, x, chunk.width);
+                let chunk_start = Offset4D::new(x, y, z, start.a);
+                let start = Offset4D::new(
+                    chunk_x.start,
+                    chunk_y.start,
+                    chunk_z.start,
+                    start.a,
+                );
+                let end =
+                    Offset4D::new(chunk_x.end, chunk_y.end, chunk_z.end, end.a);
+                f(chunk_start, start, end);
+            }
+        }
+    }
+}
+
+fn for_each_extent4d_aligned(
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+    chunk: Extent4D<units::Bytes>,
+    mut f: impl FnMut(Offset4D<units::Bytes>),
+) {
+    debug_assert!(start.x % chunk.width == 0);
+    debug_assert!(start.y % chunk.height == 0);
+    debug_assert!(start.z % chunk.depth == 0);
+    debug_assert!(start.a == 0);
+
+    debug_assert!(end.x % chunk.width == 0);
+    debug_assert!(end.y % chunk.height == 0);
+    debug_assert!(end.z % chunk.depth == 0);
+    debug_assert!(end.a == 1);
+
+    debug_assert!(chunk.width.is_power_of_two());
+    debug_assert!(chunk.height.is_power_of_two());
+    debug_assert!(chunk.depth.is_power_of_two());
+    debug_assert!(chunk.array_len == 1);
+
+    for z in (start.z..end.z).step_by(chunk.depth as usize) {
+        for y in (start.y..end.y).step_by(chunk.height as usize) {
+            for x in (start.x..end.x).step_by(chunk.width as usize) {
+                f(Offset4D::new(x, y, z, start.a));
+            }
+        }
+    }
+}
+
+struct BlockPointer {
+    pointer: usize,
+    x_mul: usize,
+    y_mul: usize,
+    z_mul: usize,
+    #[cfg(debug_assertions)]
+    bl_extent: Extent4D<units::Bytes>,
+}
+
+impl BlockPointer {
+    fn new(
+        pointer: usize,
+        bl_extent: Extent4D<units::Bytes>,
+        extent: Extent4D<units::Bytes>,
+    ) -> BlockPointer {
+        debug_assert!(bl_extent.array_len == 1);
+
+        debug_assert!(extent.width % bl_extent.width == 0);
+        debug_assert!(extent.height % bl_extent.height == 0);
+        debug_assert!(extent.depth % bl_extent.depth == 0);
+        debug_assert!(extent.array_len == 1);
+
+        BlockPointer {
+            pointer,
+            // We assume that offsets passed to at() are aligned to bl_extent, so
+            //
+            //    x_bl * bl_size_B
+            //    = (x / bl_extent.width) * bl_size_B
+            //    = x * (bl_size_B / bl_extent.width)
+            //    = x * bl_extent.height * bl_extent.depth
+            x_mul: (bl_extent.height as usize) * (bl_extent.depth as usize),

+            //    y_bl * width_bl * bl_size_B
+            //    = (y / bl_extent.height) * width_bl * bl_size_B
+            //    = y * (bl_size_B / bl_extent.height) * width_bl
+            //    = y * bl_extent.width * bl_extent.depth * width_bl
+            //    = y * (width_bl * bl_extent.width) * bl_extent.depth
+            //    = y * extent.width * bl_extent.depth
+            y_mul: (extent.width as usize) * (bl_extent.depth as usize),
+
+            //    z_bl * width_bl * height_bl * bl_size_B
+            //    = (z / bl_extent.depth) * width_bl * height_bl * bl_size_B
+            //    = z * (bl_size_B / bl_extent.depth) * width_bl * height_bl
+            //    = z * (bl_extent.width * bl_extent.height) * width_bl * height_bl
+            //    = z * width_bl * bl_extent.width * height_bl * bl_extent.height
+            //    = z * extent.width * extent.height
+            z_mul: (extent.width as usize) * (extent.height as usize),
+
+            #[cfg(debug_assertions)]
+            bl_extent,
+        }
+    }
+
+    #[inline]
+    fn at(&self, offset: Offset4D<units::Bytes>) -> usize {
+        #[cfg(debug_assertions)]
+        {
+            debug_assert!(offset.x % self.bl_extent.width == 0);
+            debug_assert!(offset.y % self.bl_extent.height == 0);
+            debug_assert!(offset.z % self.bl_extent.depth == 0);
+            debug_assert!(offset.a == 0);
+        }
+
+        self.pointer
+            + (offset.z as usize) * self.z_mul
+            + (offset.y as usize) * self.y_mul
+            + (offset.x as usize) * self.x_mul
+    }
+}
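+
+// As a concrete example (illustrative numbers only): when addressing GOBs
+// within a block that is 8 GOBs tall, bl_extent = 64x8x1 and extent = 64x64x1,
+// so x_mul = 8, y_mul = 64, and z_mul = 4096. The GOB at offset (0, 8, 0) then
+// lands at byte 8 * 64 = 512, i.e., exactly one 512B GOB into the block.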
+
+#[derive(Copy, Clone)]
+struct LinearPointer {
+    pointer: usize,
+    x_shift: u32,
+    row_stride_B: usize,
+    plane_stride_B: usize,
+}
+
+impl LinearPointer {
+    fn new(
+        pointer: usize,
+        x_divisor: u32,
+        row_stride_B: usize,
+        plane_stride_B: usize,
+    ) -> LinearPointer {
+        debug_assert!(x_divisor.is_power_of_two());
+        LinearPointer {
+            pointer,
+            x_shift: x_divisor.ilog2(),
+            row_stride_B,
+            plane_stride_B,
+        }
+    }
+
+    fn x_divisor(&self) -> u32 {
+        1 << self.x_shift
+    }
+
+    #[inline]
+    fn reverse(self, offset: Offset4D<units::Bytes>) -> LinearPointer {
+        debug_assert!(offset.x % (1 << self.x_shift) == 0);
+        debug_assert!(offset.a == 0);
+        LinearPointer {
+            pointer: self
+                .pointer
+                .wrapping_sub((offset.z as usize) * self.plane_stride_B)
+                .wrapping_sub((offset.y as usize) * self.row_stride_B)
+                .wrapping_sub((offset.x >> self.x_shift) as usize),
+            x_shift: self.x_shift,
+            row_stride_B: self.row_stride_B,
+            plane_stride_B: self.plane_stride_B,
+        }
+    }
+
+    #[inline]
+    fn at(self, offset: Offset4D<units::Bytes>) -> usize {
+        debug_assert!(offset.x % (1 << self.x_shift) == 0);
+        debug_assert!(offset.a == 0);
+        self.pointer
+            .wrapping_add((offset.z as usize) * self.plane_stride_B)
+            .wrapping_add((offset.y as usize) * self.row_stride_B)
+            .wrapping_add((offset.x >> self.x_shift) as usize)
+    }
+
+    #[inline]
+    fn offset(self, offset: Offset4D<units::Bytes>) -> LinearPointer {
+        LinearPointer {
+            pointer: self.at(offset),
+            x_shift: self.x_shift,
+            row_stride_B: self.row_stride_B,
+            plane_stride_B: self.plane_stride_B,
+        }
+    }
+}
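+
+// For example (illustrative numbers only): with x_divisor == 1 and
+// row_stride_B == 1024, reverse((16, 2, 0, 0)) moves the pointer back by
+// 2 * 1024 + 16 bytes, so that a later at((16, 2, 0, 0)) on the result gives
+// back the original pointer. This is what lets copy_tiled() below hand out
+// pointers indexed by absolute offsets within the level.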
+
+unsafe fn copy_tile<CG: CopyGOB>(
+    tiling: Tiling,
+    tile_ptr: usize,
+    linear: LinearPointer,
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+) {
+    debug_assert!(linear.x_divisor() == CG::X_DIVISOR);
+    debug_assert!(tiling.gob_type.extent_B() == CG::GOB_EXTENT_B);
+
+    let tile_extent_B = tiling.extent_B();
+    let tile_ptr = BlockPointer::new(tile_ptr, CG::GOB_EXTENT_B, tile_extent_B);
+
+    if start.is_aligned_to(CG::GOB_EXTENT_B)
+        && end.is_aligned_to(CG::GOB_EXTENT_B)
+    {
+        for_each_extent4d_aligned(start, end, CG::GOB_EXTENT_B, |gob| {
+            CG::copy_whole_gob(tile_ptr.at(gob), linear.offset(gob));
+        });
+    } else {
+        for_each_extent4d(start, end, CG::GOB_EXTENT_B, |gob, start, end| {
+            let tiled = tile_ptr.at(gob);
+            let linear = linear.offset(gob);
+            if start == Offset4D::new(0, 0, 0, 0)
+                && end == Offset4D::new(0, 0, 0, 0) + CG::GOB_EXTENT_B
+            {
+                CG::copy_whole_gob(tiled, linear);
+            } else {
+                CG::copy_gob(tiled, linear, start, end);
+            }
+        });
+    }
+}
+
+unsafe fn copy_tiled<CG: CopyGOB>(
+    tiling: Tiling,
+    level_extent_B: Extent4D<units::Bytes>,
+    level_tiled_ptr: usize,
+    linear: LinearPointer,
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+) {
+    let tile_extent_B = tiling.extent_B();
+    let level_extent_B = level_extent_B.align(&tile_extent_B);
+
+    // Back up the linear pointer so it also points at the start of the level.
+    // This way, every step of the iteration can assume that both pointers
+    // point to the start of the current level, tile, or GOB.
+    let linear = linear.reverse(start);
+
+    let level_tiled_ptr =
+        BlockPointer::new(level_tiled_ptr, tile_extent_B, level_extent_B);
+
+    for_each_extent4d(start, end, tile_extent_B, |tile, start, end| {
+        let tile_ptr = level_tiled_ptr.at(tile);
+        let linear = linear.offset(tile);
+        copy_tile::<CG>(tiling, tile_ptr, linear, start, end);
+    });
+}
+
+struct RawCopyToTiled {}
+
+impl Copy16B for RawCopyToTiled {
+    const X_DIVISOR: u32 = 1;
+
+    unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize) {
+        // This is backwards from memcpy
+        std::ptr::copy_nonoverlapping(linear, tiled, bytes);
+    }
+}
+
+struct RawCopyToLinear {}
+
+impl Copy16B for RawCopyToLinear {
+    const X_DIVISOR: u32 = 1;
+
+    unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize) {
+        // This is backwards from memcpy
+        std::ptr::copy_nonoverlapping(tiled, linear, bytes);
+    }
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nil_copy_linear_to_tiled(
+    tiled_dst: *mut c_void,
+    level_extent_B: Extent4D<units::Bytes>,
+    linear_src: *const c_void,
+    linear_row_stride_B: usize,
+    linear_plane_stride_B: usize,
+    offset_B: Offset4D<units::Bytes>,
+    extent_B: Extent4D<units::Bytes>,
+    tiling: &Tiling,
+) {
+    let end_B = offset_B + extent_B;
+
+    let linear_src = linear_src as usize;
+    let tiled_dst = tiled_dst as usize;
+    let linear_pointer = LinearPointer::new(
+        linear_src,
+        1,
+        linear_row_stride_B,
+        linear_plane_stride_B,
+    );
+
+    copy_tiled::<CopyGOBTuring2D<RawCopyToTiled>>(
+        *tiling,
+        level_extent_B,
+        tiled_dst,
+        linear_pointer,
+        offset_B,
+        end_B,
+    );
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nil_copy_tiled_to_linear(
+    linear_dst: *mut c_void,
+    linear_row_stride_B: usize,
+    linear_plane_stride_B: usize,
+    tiled_src: *const c_void,
+    level_extent_B: Extent4D<units::Bytes>,
+    offset_B: Offset4D<units::Bytes>,
+    extent_B: Extent4D<units::Bytes>,
+    tiling: &Tiling,
+) {
+    let mut end_B = offset_B + extent_B;
+    end_B.a = 1;
+    let linear_dst = linear_dst as usize;
+    let tiled_src = tiled_src as usize;
+    let linear_pointer = LinearPointer::new(
+        linear_dst,
+        1,
+        linear_row_stride_B,
+        linear_plane_stride_B,
+    );
+
+    copy_tiled::<CopyGOBTuring2D<RawCopyToLinear>>(
+        *tiling,
+        level_extent_B,
+        tiled_src,
+        linear_pointer,
+        offset_B,
+        end_B,
+    );
+}
diff --git a/src/nouveau/nil/lib.rs b/src/nouveau/nil/lib.rs
index cd62f79e9a6..58c8628f145 100644
--- a/src/nouveau/nil/lib.rs
+++ b/src/nouveau/nil/lib.rs
@@ -4,6 +4,7 @@
 extern crate nil_rs_bindings;
 extern crate nvidia_headers;
 
+mod copy;
 mod extent;
 mod format;
 mod image;