diff --git a/src/nouveau/nil/copy.rs b/src/nouveau/nil/copy.rs
new file mode 100644
index 00000000000..804d35ddfce
--- /dev/null
+++ b/src/nouveau/nil/copy.rs
@@ -0,0 +1,594 @@
+// Copyright © 2024 Valve Corp. and Collabora, Ltd.
+// SPDX-License-Identifier: MIT
+
+use crate::extent::{units, Extent4D, Offset4D};
+use crate::tiling::Tiling;
+
+use std::ffi::c_void;
+use std::ops::Range;
+
+// This file is dedicated to the internal tiling layout, mainly in the context
+// of CPU-based tiled memcpy implementations (and helpers) for
+// VK_EXT_host_image_copy.
+//
+// Work here is based on isl_tiled_memcpy, fd6_tiled_memcpy, earlier work by
+// Rebecca Mckeever, and
+// https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/
+//
+// On NVIDIA, the tiling system is a two-tier one. Images are first tiled into
+// a grid of tiles (called "Blocks"), arranged in rows of one or more columns:
+//
+// +----------+----------+----------+----------+
+// | Block 0  | Block 1  | Block 2  | Block 3  |
+// +----------+----------+----------+----------+
+// | Block 4  | Block 5  | Block 6  | Block 7  |
+// +----------+----------+----------+----------+
+// | Block 8  | Block 9  | Block 10 | Block 11 |
+// +----------+----------+----------+----------+
+//
+// The blocks themselves are ordered linearly as can be seen above, which is
+// where the "Block Linear" name for NVIDIA's tiling scheme comes from.
+//
+// For 3D images, each block continues in the Z direction such that tiles
+// contain multiple Z slices. If the image depth is greater than the
+// block depth, there will be more than one layer of blocks, where a layer is
+// made up of 1 or more Z slices. For example, if the above tile pattern was
+// the first layer of a multilayer arrangement, the second layer would be:
+//
+// +----------+----------+----------+----------+
+// | Block 12 | Block 13 | Block 14 | Block 15 |
+// +----------+----------+----------+----------+
+// | Block 16 | Block 17 | Block 18 | Block 19 |
+// +----------+----------+----------+----------+
+// | Block 20 | Block 21 | Block 22 | Block 23 |
+// +----------+----------+----------+----------+
+//
+// The number of rows, columns, and layers of tiles can thus be deduced to be:
+//
+//    rows    >= ceiling(image_height / block_height)
+//    columns >= ceiling(image_width  / block_width)
+//    layers  >= ceiling(image_depth  / block_depth)
+//
+// where block_width is a constant 64B (except for sparse) and block_height
+// can be either 8 or 16 GOBs tall (more on GOBs below). For us, block_depth
+// is one for now.
+//
+// The >= is in case the blocks around the edges are partial.
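+//
+// As a quick illustration of those formulas (this helper is hypothetical and
+// is not used by the code in this file):
+//
+//    fn block_grid(
+//        image: (u32, u32, u32),
+//        block: (u32, u32, u32),
+//    ) -> (u32, u32, u32) {
+//        // (columns, rows, layers) of blocks, rounding up for partial blocks
+//        (
+//            image.0.div_ceil(block.0),
+//            image.1.div_ceil(block.1),
+//            image.2.div_ceil(block.2),
+//        )
+//    }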
+//
+// Now comes the second tier. Each block is composed of GOBs (Groups of Bytes)
+// arranged in ascending order in a single column:
+//
+// +---------------------------+
+// |           GOB 0           |
+// +---------------------------+
+// |           GOB 1           |
+// +---------------------------+
+// |           GOB 2           |
+// +---------------------------+
+// |           GOB 3           |
+// +---------------------------+
+//
+// The number of GOBs in a full block is
+//
+//    block_height * block_depth
+//
+// An Ampere GOB is 512 bytes, arranged in a 64Bx8 layout, and is split into
+// Sectors. Each Sector is 32 bytes, arranged in a 16Bx2 layout (i.e., two 16B
+// lines on top of each other). The sectors are then arranged into two columns
+// that are each 2 sectors wide by 4 sectors tall, leading to a 4x4 grid of
+// sectors (sector numbers below are relative to their column):
+//
+// +----------+----------+----------+----------+
+// | Sector 0 | Sector 1 | Sector 0 | Sector 1 |
+// +----------+----------+----------+----------+
+// | Sector 2 | Sector 3 | Sector 2 | Sector 3 |
+// +----------+----------+----------+----------+
+// | Sector 4 | Sector 5 | Sector 4 | Sector 5 |
+// +----------+----------+----------+----------+
+// | Sector 6 | Sector 7 | Sector 6 | Sector 7 |
+// +----------+----------+----------+----------+
+//
+// From the given pixel address equations in the Orin manual, we arrived at
+// the following bit interleave pattern for the pixel address within a GOB:
+//
+//  b8 b7 b6 b5 b4 b3 b2 b1 b0
+//  --------------------------
+//  x5 y2 y1 x4 y0 x3 x2 x1 x0
+//
+// Which would look something like this:
+//
+//    fn get_pixel_offset(
+//        x: usize,
+//        y: usize,
+//    ) -> usize {
+//        (x & 15) |
+//        (y & 1) << 4 |
+//        (x & 16) << 1 |
+//        (y & 6) << 5 |
+//        (x & 32) << 3
+//    }
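+//
+// To show how the two tiers compose (a sketch only; this helper is
+// hypothetical, not used below, and assumes block_depth == 1 and blocks that
+// are 1 GOB wide), the byte offset of (x, y) in an image that is width_bl
+// blocks wide could be written as:
+//
+//    fn get_tiled_offset(
+//        x: usize,
+//        y: usize,
+//        width_bl: usize,          // image width in blocks
+//        block_height_gobs: usize, // 8 or 16
+//    ) -> usize {
+//        let (gob_w, gob_h) = (64, 8);
+//        let block_h = gob_h * block_height_gobs;
+//        let block_size = gob_w * block_h;
+//        // Blocks are laid out linearly, GOBs stack in a single column
+//        let block_idx = (y / block_h) * width_bl + (x / gob_w);
+//        let gob_idx = (y % block_h) / gob_h;
+//        block_idx * block_size
+//            + gob_idx * (gob_w * gob_h)
+//            + get_pixel_offset(x % gob_w, y % gob_h)
+//    }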
+
+// The way our implementation works is by splitting an image into tiles; each
+// tile is then broken into its GOBs, and finally each GOB into sectors, where
+// each sector is copied into its position.
+//
+// For code sharing and cleanliness, we write everything to be very generic,
+// so that it can be shared between the Linear <-> Tiled and Tiled <-> Linear
+// paths, and (ab)use Rust's traits to specialize the last level
+// (copy_gob/copy_whole_gob) for a particular direction.
+//
+// The copy_x and copy_whole_x distinction is made because, if we can
+// guarantee that tiles/GOBs are whole and aligned, we can skip all bounds
+// checking and copy things in fast and tight loops.
+
+/// Copies a GOB
+///
+/// This trait should be implemented twice for each GOB type, once for
+/// tiled-to-linear and once for linear-to-tiled. This allows us to implement
+/// the rest of the tiled copies in a generic way.
+trait CopyGOB {
+    const GOB_EXTENT_B: Extent4D<units::Bytes>;
+    const X_DIVISOR: u32;
+
+    unsafe fn copy_gob(
+        tiled: usize,
+        linear: LinearPointer,
+        start: Offset4D<units::Bytes>,
+        end: Offset4D<units::Bytes>,
+    );
+
+    // No bounding box for this one
+    unsafe fn copy_whole_gob(tiled: usize, linear: LinearPointer) {
+        Self::copy_gob(
+            tiled,
+            linear,
+            Offset4D::new(0, 0, 0, 0),
+            Offset4D::new(0, 0, 0, 0) + Self::GOB_EXTENT_B,
+        );
+    }
+}
+
+/// Copies at most 16B of data to/from linear
+trait Copy16B {
+    const X_DIVISOR: u32;
+
+    unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize);
+
+    unsafe fn copy_16b(tiled: *mut [u8; 16], linear: *mut [u8; 16]) {
+        Self::copy(tiled as *mut _, linear as *mut _, 16);
+    }
+}
+
+struct CopyGOBTuring2D<C: Copy16B> {
+    phantom: std::marker::PhantomData<C>,
+}
+
+impl<C: Copy16B> CopyGOBTuring2D<C> {
+    // Calls f(gob_offset_B, x_B, y) for each 16B line in the GOB, following
+    // the sector layout described at the top of this file.
+    fn for_each_16b(mut f: impl FnMut(u32, u32, u32)) {
+        for i in 0..2 {
+            f(i * 0x100 + 0x00, i * 32 + 0, 0);
+            f(i * 0x100 + 0x10, i * 32 + 0, 1);
+            f(i * 0x100 + 0x20, i * 32 + 16, 0);
+            f(i * 0x100 + 0x30, i * 32 + 16, 1);
+
+            f(i * 0x100 + 0x40, i * 32 + 0, 2);
+            f(i * 0x100 + 0x50, i * 32 + 0, 3);
+            f(i * 0x100 + 0x60, i * 32 + 16, 2);
+            f(i * 0x100 + 0x70, i * 32 + 16, 3);
+
+            f(i * 0x100 + 0x80, i * 32 + 0, 4);
+            f(i * 0x100 + 0x90, i * 32 + 0, 5);
+            f(i * 0x100 + 0xa0, i * 32 + 16, 4);
+            f(i * 0x100 + 0xb0, i * 32 + 16, 5);
+
+            f(i * 0x100 + 0xc0, i * 32 + 0, 6);
+            f(i * 0x100 + 0xd0, i * 32 + 0, 7);
+            f(i * 0x100 + 0xe0, i * 32 + 16, 6);
+            f(i * 0x100 + 0xf0, i * 32 + 16, 7);
+        }
+    }
+}
+
+impl<C: Copy16B> CopyGOB for CopyGOBTuring2D<C> {
+    const GOB_EXTENT_B: Extent4D<units::Bytes> = Extent4D::new(64, 8, 1, 1);
+    const X_DIVISOR: u32 = C::X_DIVISOR;
+
+    unsafe fn copy_gob(
+        tiled: usize,
+        linear: LinearPointer,
+        start: Offset4D<units::Bytes>,
+        end: Offset4D<units::Bytes>,
+    ) {
+        Self::for_each_16b(|offset, x, y| {
+            if y >= start.y && y < end.y {
+                let tiled = tiled + (offset as usize);
+                let linear = linear.at(Offset4D::new(x, y, 0, 0));
+                if x >= start.x && x + 16 <= end.x {
+                    // The whole 16B line is inside the bounding box
+                    C::copy_16b(tiled as *mut _, linear as *mut _);
+                } else if x + 16 >= start.x && x < end.x {
+                    // The line is only partially inside the bounding box
+                    let start = (std::cmp::max(x, start.x) - x) as usize;
+                    let end = std::cmp::min(end.x - x, 16) as usize;
+                    C::copy(
+                        (tiled + start) as *mut _,
+                        (linear + start) as *mut _,
+                        end - start,
+                    );
+                }
+            }
+        });
+    }
+
+    unsafe fn copy_whole_gob(tiled: usize, linear: LinearPointer) {
+        Self::for_each_16b(|offset, x, y| {
+            let tiled = tiled + (offset as usize);
+            let linear = linear.at(Offset4D::new(x, y, 0, 0));
+            C::copy_16b(tiled as *mut _, linear as *mut _);
+        });
+    }
+}
+
+fn aligned_range(start: u32, end: u32, align: u32) -> Range<u32> {
+    debug_assert!(align.is_power_of_two());
+    let align_1 = align - 1;
+    (start & !align_1)..((end + align_1) & !align_1)
+}
+
+fn chunk_range(
+    whole: Range<u32>,
+    chunk_start: u32,
+    chunk_len: u32,
+) -> Range<u32> {
+    debug_assert!(chunk_start < whole.end);
+    let start = if chunk_start < whole.start {
+        whole.start - chunk_start
+    } else {
+        0
+    };
+    let end = std::cmp::min(whole.end - chunk_start, chunk_len);
+    start..end
+}
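+
+// For example (values purely illustrative), copying the byte range 10..100
+// with 64B-wide chunks visits the chunks starting at 0 and 64:
+//
+//    aligned_range(10, 100, 64)   == 0..128
+//    chunk_range(10..100, 0, 64)  == 10..64  // leading chunk is partial
+//    chunk_range(10..100, 64, 64) == 0..36   // trailing chunk is partial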
+
+fn for_each_extent4d(
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+    chunk: Extent4D<units::Bytes>,
+    mut f: impl FnMut(
+        Offset4D<units::Bytes>,
+        Offset4D<units::Bytes>,
+        Offset4D<units::Bytes>,
+    ),
+) {
+    debug_assert!(chunk.width.is_power_of_two());
+    debug_assert!(chunk.height.is_power_of_two());
+    debug_assert!(chunk.depth.is_power_of_two());
+    debug_assert!(chunk.array_len == 1);
+
+    debug_assert!(start.a == 0);
+    debug_assert!(end.a == 1);
+
+    let x_range = aligned_range(start.x, end.x, chunk.width);
+    let y_range = aligned_range(start.y, end.y, chunk.height);
+    let z_range = aligned_range(start.z, end.z, chunk.depth);
+
+    for z in z_range.step_by(chunk.depth as usize) {
+        let chunk_z = chunk_range(start.z..end.z, z, chunk.depth);
+        for y in y_range.clone().step_by(chunk.height as usize) {
+            let chunk_y = chunk_range(start.y..end.y, y, chunk.height);
+            for x in x_range.clone().step_by(chunk.width as usize) {
+                let chunk_x = chunk_range(start.x..end.x, x, chunk.width);
+                let chunk_start = Offset4D::new(x, y, z, start.a);
+                let start = Offset4D::new(
+                    chunk_x.start,
+                    chunk_y.start,
+                    chunk_z.start,
+                    start.a,
+                );
+                let end =
+                    Offset4D::new(chunk_x.end, chunk_y.end, chunk_z.end, end.a);
+                f(chunk_start, start, end);
+            }
+        }
+    }
+}
+
+fn for_each_extent4d_aligned(
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+    chunk: Extent4D<units::Bytes>,
+    mut f: impl FnMut(Offset4D<units::Bytes>),
+) {
+    debug_assert!(start.x % chunk.width == 0);
+    debug_assert!(start.y % chunk.height == 0);
+    debug_assert!(start.z % chunk.depth == 0);
+    debug_assert!(start.a == 0);
+
+    debug_assert!(end.x % chunk.width == 0);
+    debug_assert!(end.y % chunk.height == 0);
+    debug_assert!(end.z % chunk.depth == 0);
+    debug_assert!(end.a == 1);
+
+    debug_assert!(chunk.width.is_power_of_two());
+    debug_assert!(chunk.height.is_power_of_two());
+    debug_assert!(chunk.depth.is_power_of_two());
+    debug_assert!(chunk.array_len == 1);
+
+    for z in (start.z..end.z).step_by(chunk.depth as usize) {
+        for y in (start.y..end.y).step_by(chunk.height as usize) {
+            for x in (start.x..end.x).step_by(chunk.width as usize) {
+                f(Offset4D::new(x, y, z, start.a));
+            }
+        }
+    }
+}
+
+struct BlockPointer {
+    pointer: usize,
+    x_mul: usize,
+    y_mul: usize,
+    z_mul: usize,
+    #[cfg(debug_assertions)]
+    bl_extent: Extent4D<units::Bytes>,
+}
+
+impl BlockPointer {
+    fn new(
+        pointer: usize,
+        bl_extent: Extent4D<units::Bytes>,
+        extent: Extent4D<units::Bytes>,
+    ) -> BlockPointer {
+        debug_assert!(bl_extent.array_len == 1);
+
+        debug_assert!(extent.width % bl_extent.width == 0);
+        debug_assert!(extent.height % bl_extent.height == 0);
+        debug_assert!(extent.depth % bl_extent.depth == 0);
+        debug_assert!(extent.array_len == 1);
+
+        BlockPointer {
+            pointer,
+            // We assume that offsets passed to at() are aligned to bl_extent, so
+            //
+            //    x_bl * bl_size_B
+            //    = (x / bl_extent.width) * bl_size_B
+            //    = x * (bl_size_B / bl_extent.width)
+            //    = x * bl_extent.height * bl_extent.depth
+            x_mul: (bl_extent.height as usize) * (bl_extent.depth as usize),

+            //    y_bl * width_bl * bl_size_B
+            //    = (y / bl_extent.height) * width_bl * bl_size_B
+            //    = y * (bl_size_B / bl_extent.height) * width_bl
+            //    = y * bl_extent.width * bl_extent.depth * width_bl
+            //    = y * (width_bl * bl_extent.width) * bl_extent.depth
+            //    = y * extent.width * bl_extent.depth
+            y_mul: (extent.width as usize) * (bl_extent.depth as usize),
+
+            //    z_bl * width_bl * height_bl * bl_size_B
+            //    = (z / bl_extent.depth) * width_bl * height_bl * bl_size_B
+            //    = z * (bl_size_B / bl_extent.depth) * width_bl * height_bl
+            //    = z * (bl_extent.width * bl_extent.height) * width_bl * height_bl
+            //    = z * width_bl * bl_extent.width * height_bl * bl_extent.height
+            //    = z * extent.width * extent.height
+            z_mul: (extent.width as usize) * (extent.height as usize),
+
+            #[cfg(debug_assertions)]
+            bl_extent,
+        }
+    }
+
+    #[inline]
+    fn at(&self, offset: Offset4D<units::Bytes>) -> usize {
+        #[cfg(debug_assertions)]
+        {
+            debug_assert!(offset.x % self.bl_extent.width == 0);
+            debug_assert!(offset.y % self.bl_extent.height == 0);
+            debug_assert!(offset.z % self.bl_extent.depth == 0);
+            debug_assert!(offset.a == 0);
+        }
+
+        self.pointer
+            + (offset.z as usize) * self.z_mul
+            + (offset.y as usize) * self.y_mul
+            + (offset.x as usize) * self.x_mul
+    }
+}
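+
+// As a concrete example (illustrative numbers only): when addressing GOBs
+// within a block that is 8 GOBs tall, bl_extent = 64x8x1 and extent = 64x64x1,
+// so x_mul = 8, y_mul = 64, and z_mul = 4096. The GOB at offset (0, 8, 0) then
+// lands at byte 8 * 64 = 512, i.e., exactly one 512B GOB into the block.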
+
+#[derive(Copy, Clone)]
+struct LinearPointer {
+    pointer: usize,
+    x_shift: u32,
+    row_stride_B: usize,
+    plane_stride_B: usize,
+}
+
+impl LinearPointer {
+    fn new(
+        pointer: usize,
+        x_divisor: u32,
+        row_stride_B: usize,
+        plane_stride_B: usize,
+    ) -> LinearPointer {
+        debug_assert!(x_divisor.is_power_of_two());
+        LinearPointer {
+            pointer,
+            x_shift: x_divisor.ilog2(),
+            row_stride_B,
+            plane_stride_B,
+        }
+    }
+
+    fn x_divisor(&self) -> u32 {
+        1 << self.x_shift
+    }
+
+    #[inline]
+    fn reverse(self, offset: Offset4D<units::Bytes>) -> LinearPointer {
+        debug_assert!(offset.x % (1 << self.x_shift) == 0);
+        debug_assert!(offset.a == 0);
+        LinearPointer {
+            pointer: self
+                .pointer
+                .wrapping_sub((offset.z as usize) * self.plane_stride_B)
+                .wrapping_sub((offset.y as usize) * self.row_stride_B)
+                .wrapping_sub((offset.x >> self.x_shift) as usize),
+            x_shift: self.x_shift,
+            row_stride_B: self.row_stride_B,
+            plane_stride_B: self.plane_stride_B,
+        }
+    }
+
+    #[inline]
+    fn at(self, offset: Offset4D<units::Bytes>) -> usize {
+        debug_assert!(offset.x % (1 << self.x_shift) == 0);
+        debug_assert!(offset.a == 0);
+        self.pointer
+            .wrapping_add((offset.z as usize) * self.plane_stride_B)
+            .wrapping_add((offset.y as usize) * self.row_stride_B)
+            .wrapping_add((offset.x >> self.x_shift) as usize)
+    }
+
+    #[inline]
+    fn offset(self, offset: Offset4D<units::Bytes>) -> LinearPointer {
+        LinearPointer {
+            pointer: self.at(offset),
+            x_shift: self.x_shift,
+            row_stride_B: self.row_stride_B,
+            plane_stride_B: self.plane_stride_B,
+        }
+    }
+}
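+
+// For example (illustrative numbers only): with x_divisor == 1 and
+// row_stride_B == 1024, reverse((16, 2, 0, 0)) moves the pointer back by
+// 2 * 1024 + 16 bytes, so that a later at((16, 2, 0, 0)) on the result gives
+// back the original pointer. This is what lets copy_tiled() below hand out
+// pointers indexed by absolute offsets within the level.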
+
+unsafe fn copy_tile<CG: CopyGOB>(
+    tiling: Tiling,
+    tile_ptr: usize,
+    linear: LinearPointer,
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+) {
+    debug_assert!(linear.x_divisor() == CG::X_DIVISOR);
+    debug_assert!(tiling.gob_type.extent_B() == CG::GOB_EXTENT_B);
+
+    let tile_extent_B = tiling.extent_B();
+    let tile_ptr = BlockPointer::new(tile_ptr, CG::GOB_EXTENT_B, tile_extent_B);
+
+    if start.is_aligned_to(CG::GOB_EXTENT_B)
+        && end.is_aligned_to(CG::GOB_EXTENT_B)
+    {
+        for_each_extent4d_aligned(start, end, CG::GOB_EXTENT_B, |gob| {
+            CG::copy_whole_gob(tile_ptr.at(gob), linear.offset(gob));
+        });
+    } else {
+        for_each_extent4d(start, end, CG::GOB_EXTENT_B, |gob, start, end| {
+            let tiled = tile_ptr.at(gob);
+            let linear = linear.offset(gob);
+            if start == Offset4D::new(0, 0, 0, 0)
+                && end == Offset4D::new(0, 0, 0, 0) + CG::GOB_EXTENT_B
+            {
+                CG::copy_whole_gob(tiled, linear);
+            } else {
+                CG::copy_gob(tiled, linear, start, end);
+            }
+        });
+    }
+}
+
+unsafe fn copy_tiled<CG: CopyGOB>(
+    tiling: Tiling,
+    level_extent_B: Extent4D<units::Bytes>,
+    level_tiled_ptr: usize,
+    linear: LinearPointer,
+    start: Offset4D<units::Bytes>,
+    end: Offset4D<units::Bytes>,
+) {
+    let tile_extent_B = tiling.extent_B();
+    let level_extent_B = level_extent_B.align(&tile_extent_B);
+
+    // Back up the linear pointer so it also points at the start of the level.
+    // This way, every step of the iteration can assume that both pointers
+    // point to the start of the current level, tile, or GOB.
+    let linear = linear.reverse(start);
+
+    let level_tiled_ptr =
+        BlockPointer::new(level_tiled_ptr, tile_extent_B, level_extent_B);
+
+    for_each_extent4d(start, end, tile_extent_B, |tile, start, end| {
+        let tile_ptr = level_tiled_ptr.at(tile);
+        let linear = linear.offset(tile);
+        copy_tile::<CG>(tiling, tile_ptr, linear, start, end);
+    });
+}
+
+struct RawCopyToTiled {}
+
+impl Copy16B for RawCopyToTiled {
+    const X_DIVISOR: u32 = 1;
+
+    unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize) {
+        // This is backwards from memcpy
+        std::ptr::copy_nonoverlapping(linear, tiled, bytes);
+    }
+}
+
+struct RawCopyToLinear {}
+
+impl Copy16B for RawCopyToLinear {
+    const X_DIVISOR: u32 = 1;
+
+    unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize) {
+        // This is backwards from memcpy
+        std::ptr::copy_nonoverlapping(tiled, linear, bytes);
+    }
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nil_copy_linear_to_tiled(
+    tiled_dst: *mut c_void,
+    level_extent_B: Extent4D<units::Bytes>,
+    linear_src: *const c_void,
+    linear_row_stride_B: usize,
+    linear_plane_stride_B: usize,
+    offset_B: Offset4D<units::Bytes>,
+    extent_B: Extent4D<units::Bytes>,
+    tiling: &Tiling,
+) {
+    let end_B = offset_B + extent_B;
+
+    let linear_src = linear_src as usize;
+    let tiled_dst = tiled_dst as usize;
+    let linear_pointer = LinearPointer::new(
+        linear_src,
+        1,
+        linear_row_stride_B,
+        linear_plane_stride_B,
+    );
+
+    copy_tiled::<CopyGOBTuring2D<RawCopyToTiled>>(
+        *tiling,
+        level_extent_B,
+        tiled_dst,
+        linear_pointer,
+        offset_B,
+        end_B,
+    );
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nil_copy_tiled_to_linear(
+    linear_dst: *mut c_void,
+    linear_row_stride_B: usize,
+    linear_plane_stride_B: usize,
+    tiled_src: *const c_void,
+    level_extent_B: Extent4D<units::Bytes>,
+    offset_B: Offset4D<units::Bytes>,
+    extent_B: Extent4D<units::Bytes>,
+    tiling: &Tiling,
+) {
+    let mut end_B = offset_B + extent_B;
+    end_B.a = 1;
+    let linear_dst = linear_dst as usize;
+    let tiled_src = tiled_src as usize;
+    let linear_pointer = LinearPointer::new(
+        linear_dst,
+        1,
+        linear_row_stride_B,
+        linear_plane_stride_B,
+    );
+
+    copy_tiled::<CopyGOBTuring2D<RawCopyToLinear>>(
+        *tiling,
+        level_extent_B,
+        tiled_src,
+        linear_pointer,
+        offset_B,
+        end_B,
+    );
+}
diff --git a/src/nouveau/nil/lib.rs b/src/nouveau/nil/lib.rs
index cd62f79e9a6..58c8628f145 100644
--- a/src/nouveau/nil/lib.rs
+++ b/src/nouveau/nil/lib.rs
@@ -4,6 +4,7 @@
 extern crate nil_rs_bindings;
 extern crate nvidia_headers;
 
+mod copy;
 mod extent;
 mod format;
 mod image;