nil: Add tiled memcpy helpers

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30044>
Faith Ekstrand 2024-06-01 21:23:23 +03:00 committed by Marge Bot
parent 1c131de30e
commit b99f28b7d6
2 changed files with 595 additions and 0 deletions

src/nouveau/nil/copy.rs Normal file

@@ -0,0 +1,594 @@
// Copyright © 2024 Valve Corp. and Collabora, Ltd.
// SPDX-License-Identifier: MIT
use crate::extent::{units, Extent4D, Offset4D};
use crate::tiling::Tiling;
use std::ffi::c_void;
use std::ops::Range;
// This file is dedicated to the internal tiling layout, mainly in the context
// of CPU-based tiled memcpy implementations (and helpers) for VK_EXT_host_image_copy
//
// Work here is based on isl_tiled_memcpy, fd6_tiled_memcpy, old work by Rebecca Mckeever,
// and https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/
//
// On NVIDIA, the tiling scheme is a two-tier one. Images are first tiled in
// a grid of tiles (called "Blocks") laid out in rows and columns:
//
// +----------+----------+----------+----------+
// | Block 0 | Block 1 | Block 2 | Block 3 |
// +----------+----------+----------+----------+
// | Block 4 | Block 5 | Block 6 | Block 7 |
// +----------+----------+----------+----------+
// | Block 8 | Block 9 | Block 10 | Block 11 |
// +----------+----------+----------+----------+
//
// The blocks themselves are ordered linearly as can be seen above, which is
// where the "Block Linear" naming comes from for NVIDIA's tiling scheme.
//
// For 3D images, each block continues in the Z direction such that blocks
// contain multiple Z slices. If the image depth is greater than the block
// depth, there will be more than one layer of blocks, where a layer is made
// up of one or more Z slices. For example, if the above block pattern were
// the first layer of a multi-layer arrangement, the second layer would be:
//
// +----------+----------+----------+----------+
// | Block 12 | Block 13 | Block 14 | Block 15 |
// +----------+----------+----------+----------+
// | Block 16 | Block 17 | Block 18 | Block 19 |
// +----------+----------+----------+----------+
// | Block 20 | Block 21 | Block 22 | Block 23 |
// +----------+----------+----------+----------+
//
// The number of rows, columns, and layers of tiles can thus be deduced to be:
// rows >= ceiling(image_height / block_height)
// columns >= ceiling(image_width / block_width)
// layers >= ceiling(image_depth / block_depth)
//
// Where block_width is a constant 64B (except for sparse images) and
// block_height can be either 8 or 16 GOBs tall (more on GOBs below). For
// us, block_depth is 1 for now.
//
// The >= is in case the blocks around the edges are partial.
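//
// As a concrete example (illustrative numbers, not taken from this commit):
// for a 300B x 100-row x 1-slice level with 64B x 64-row x 1-slice blocks
// (8 GOBs tall), this gives
//
// columns >= ceiling(300 / 64) = 5
// rows >= ceiling(100 / 64) = 2
// layers >= ceiling(1 / 1) = 1
//
// with the right-most column and bottom row of blocks only partially covered
// by image data.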
//
// Now comes the second tier. Each block is composed of GOBs (Groups of Bytes)
// arranged in ascending order in a single column:
//
// +---------------------------+
// | GOB 0 |
// +---------------------------+
// | GOB 1 |
// +---------------------------+
// | GOB 2 |
// +---------------------------+
// | GOB 3 |
// +---------------------------+
//
// The number of GOBs in a full block is
// block_height * block_depth
//
// An Ampere GOB is 512 bytes, arranged in a 64x8 byte layout and split into
// Sectors. Each Sector is 32 bytes, and the bytes within a Sector are
// arranged in a 16x2 layout (i.e., two 16B lines on top of each other). The
// 16 Sectors of a GOB are arranged in two side-by-side groups of 2x4
// sectors, leading to a 4x4 grid of sectors:
//
// +----------+----------+-----------+-----------+
// | Sector 0 | Sector 1 | Sector 8  | Sector 9  |
// +----------+----------+-----------+-----------+
// | Sector 2 | Sector 3 | Sector 10 | Sector 11 |
// +----------+----------+-----------+-----------+
// | Sector 4 | Sector 5 | Sector 12 | Sector 13 |
// +----------+----------+-----------+-----------+
// | Sector 6 | Sector 7 | Sector 14 | Sector 15 |
// +----------+----------+-----------+-----------+
//
// From the given pixel address equations in the Orin manual, we arrived at
// the following bit interleave pattern for the pixel address:
//
// b8 b7 b6 b5 b4 b3 b2 b1 b0
// --------------------------
// x5 y2 y1 x4 y0 x3 x2 x1 x0
//
// Which would look something like this:
// fn get_pixel_offset(
// x: usize,
// y: usize,
// ) -> usize {
// (x & 15) |
// (y & 1) << 4 |
// (x & 16) << 1 |
// (y & 6) << 5 |
// (x & 32) << 3
// }
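//
// As a quick sanity check of the pattern (our arithmetic, not from the
// manual): for (x, y) = (17, 5) we get b3..b0 = 0001 from x, b4 = 1 (y0),
// b5 = 1 (x4), and b7 = 1 (y2), so the byte lands at 0b010110001 = 0xb1,
// i.e. 1 byte into the 16B line at GOB offset 0xb0 in for_each_16b() below.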
//
//
// Our implementation works by splitting an image into tiles, breaking each
// tile into its GOBs, and finally breaking each GOB into its 16B lines, each
// of which is copied into position.
//
// For code sharing and cleanliness, we write everything to be very generic,
// so it can be shared between the Linear -> Tiled and Tiled -> Linear paths,
// and (ab)use Rust's traits to specialize the last level
// (copy_gob/copy_whole_gob) for a particular direction.
//
// The copy_x and copy_whole_x distinction exists because, when we can
// guarantee that tiles/GOBs are whole and aligned, we can skip all bounds
// checking and copy things in fast, tight loops.
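//
// Schematically (structure only):
//
// copy_tiled()
// -> for each tile overlapping [start, end): copy_tile()
// -> for each GOB: copy_whole_gob() if whole and aligned (no bounds
// checks), otherwise copy_gob()
// -> for each 16B line: clip against [start, end) and copy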
/// Copies a GOB
///
/// This trait should be implemented twice for each GOB type, once for
/// tiled-to-linear and once for linear-to-tiled. This allows the rest of
/// the tiled copy code to be implemented generically.
trait CopyGOB {
const GOB_EXTENT_B: Extent4D<units::Bytes>;
const X_DIVISOR: u32;
unsafe fn copy_gob(
tiled: usize,
linear: LinearPointer,
start: Offset4D<units::Bytes>,
end: Offset4D<units::Bytes>,
);
// No bounding box for this one: the whole GOB is copied.
unsafe fn copy_whole_gob(tiled: usize, linear: LinearPointer) {
Self::copy_gob(
tiled,
linear,
Offset4D::new(0, 0, 0, 0),
Offset4D::new(0, 0, 0, 0) + Self::GOB_EXTENT_B,
);
}
}
/// Copies at most 16B of data to/from linear
trait Copy16B {
const X_DIVISOR: u32;
unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize);
unsafe fn copy_16b(tiled: *mut [u8; 16], linear: *mut [u8; 16]) {
Self::copy(tiled as *mut _, linear as *mut _, 16);
}
}
struct CopyGOBTuring2D<C: Copy16B> {
phantom: std::marker::PhantomData<C>,
}
impl<C: Copy16B> CopyGOBTuring2D<C> {
fn for_each_16b(mut f: impl FnMut(u32, u32, u32)) {
for i in 0..2 {
f(i * 0x100 + 0x00, i * 32 + 0, 0);
f(i * 0x100 + 0x10, i * 32 + 0, 1);
f(i * 0x100 + 0x20, i * 32 + 16, 0);
f(i * 0x100 + 0x30, i * 32 + 16, 1);
f(i * 0x100 + 0x40, i * 32 + 0, 2);
f(i * 0x100 + 0x50, i * 32 + 0, 3);
f(i * 0x100 + 0x60, i * 32 + 16, 2);
f(i * 0x100 + 0x70, i * 32 + 16, 3);
f(i * 0x100 + 0x80, i * 32 + 0, 4);
f(i * 0x100 + 0x90, i * 32 + 0, 5);
f(i * 0x100 + 0xa0, i * 32 + 16, 4);
f(i * 0x100 + 0xb0, i * 32 + 16, 5);
f(i * 0x100 + 0xc0, i * 32 + 0, 6);
f(i * 0x100 + 0xd0, i * 32 + 0, 7);
f(i * 0x100 + 0xe0, i * 32 + 16, 6);
f(i * 0x100 + 0xf0, i * 32 + 16, 7);
}
}
}
impl<C: Copy16B> CopyGOB for CopyGOBTuring2D<C> {
const GOB_EXTENT_B: Extent4D<units::Bytes> = Extent4D::new(64, 8, 1, 1);
const X_DIVISOR: u32 = C::X_DIVISOR;
unsafe fn copy_gob(
tiled: usize,
linear: LinearPointer,
start: Offset4D<units::Bytes>,
end: Offset4D<units::Bytes>,
) {
Self::for_each_16b(|offset, x, y| {
if y >= start.y && y < end.y {
let tiled = tiled + (offset as usize);
let linear = linear.at(Offset4D::new(x, y, 0, 0));
if x >= start.x && x + 16 <= end.x {
C::copy_16b(tiled as *mut _, linear as *mut _);
} else if x + 16 >= start.x && x < end.x {
let start = (std::cmp::max(x, start.x) - x) as usize;
let end = std::cmp::min(end.x - x, 16) as usize;
C::copy(
(tiled + start) as *mut _,
(linear + start) as *mut _,
end - start,
);
}
}
});
}
unsafe fn copy_whole_gob(tiled: usize, linear: LinearPointer) {
Self::for_each_16b(|offset, x, y| {
let tiled = tiled + (offset as usize);
let linear = linear.at(Offset4D::new(x, y, 0, 0));
C::copy_16b(tiled as *mut _, linear as *mut _);
});
}
}
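/// Expands [start, end) outward to `align` boundaries.
///
/// For example (illustrative): aligned_range(5, 13, 8) == 0..16.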
fn aligned_range(start: u32, end: u32, align: u32) -> Range<u32> {
debug_assert!(align.is_power_of_two());
let align_1 = align - 1;
(start & !align_1)..((end + align_1) & !align_1)
}
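/// Clips the range `whole` against the chunk [chunk_start, chunk_start +
/// chunk_len) and returns the overlap in chunk-relative coordinates.
///
/// For example (illustrative): chunk_range(5..13, 0, 8) == 5..8 and
/// chunk_range(5..13, 8, 8) == 0..5.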
fn chunk_range(
whole: Range<u32>,
chunk_start: u32,
chunk_len: u32,
) -> Range<u32> {
debug_assert!(chunk_start < whole.end);
let start = if chunk_start < whole.start {
whole.start - chunk_start
} else {
0
};
let end = std::cmp::min(whole.end - chunk_start, chunk_len);
start..end
}
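/// Walks [start, end) in `chunk`-sized steps, calling `f` with the aligned
/// offset of each chunk touched by the range, plus the chunk-relative start
/// and end of the portion of [start, end) that falls within that chunk.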
fn for_each_extent4d<U>(
start: Offset4D<U>,
end: Offset4D<U>,
chunk: Extent4D<U>,
mut f: impl FnMut(Offset4D<U>, Offset4D<U>, Offset4D<U>),
) {
debug_assert!(chunk.width.is_power_of_two());
debug_assert!(chunk.height.is_power_of_two());
debug_assert!(chunk.depth.is_power_of_two());
debug_assert!(chunk.array_len == 1);
debug_assert!(start.a == 0);
debug_assert!(end.a == 1);
let x_range = aligned_range(start.x, end.x, chunk.width);
let y_range = aligned_range(start.y, end.y, chunk.height);
let z_range = aligned_range(start.z, end.z, chunk.depth);
for z in z_range.step_by(chunk.depth as usize) {
let chunk_z = chunk_range(start.z..end.z, z, chunk.depth);
for y in y_range.clone().step_by(chunk.height as usize) {
let chunk_y = chunk_range(start.y..end.y, y, chunk.height);
for x in x_range.clone().step_by(chunk.width as usize) {
let chunk_x = chunk_range(start.x..end.x, x, chunk.width);
let chunk_start = Offset4D::new(x, y, z, start.a);
let start = Offset4D::new(
chunk_x.start,
chunk_y.start,
chunk_z.start,
start.a,
);
let end =
Offset4D::new(chunk_x.end, chunk_y.end, chunk_z.end, end.a);
f(chunk_start, start, end);
}
}
}
}
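/// Like for_each_extent4d(), but requires [start, end) to be chunk-aligned
/// so that every chunk is whole and `f` only needs the chunk offset.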
fn for_each_extent4d_aligned<U>(
start: Offset4D<U>,
end: Offset4D<U>,
chunk: Extent4D<U>,
mut f: impl FnMut(Offset4D<U>),
) {
debug_assert!(start.x % chunk.width == 0);
debug_assert!(start.y % chunk.height == 0);
debug_assert!(start.z % chunk.depth == 0);
debug_assert!(start.a == 0);
debug_assert!(end.x % chunk.width == 0);
debug_assert!(end.y % chunk.height == 0);
debug_assert!(end.z % chunk.depth == 0);
debug_assert!(end.a == 1);
debug_assert!(chunk.width.is_power_of_two());
debug_assert!(chunk.height.is_power_of_two());
debug_assert!(chunk.depth.is_power_of_two());
debug_assert!(chunk.array_len == 1);
for z in (start.z..end.z).step_by(chunk.depth as usize) {
for y in (start.y..end.y).step_by(chunk.height as usize) {
for x in (start.x..end.x).step_by(chunk.width as usize) {
f(Offset4D::new(x, y, z, start.a));
}
}
}
}
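/// A pointer into a two-tier tiled layout, with multipliers precomputed so
/// that at() can translate a block-aligned byte offset into an address with
/// three multiply-adds.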
struct BlockPointer {
pointer: usize,
x_mul: usize,
y_mul: usize,
z_mul: usize,
#[cfg(debug_assertions)]
bl_extent: Extent4D<units::Bytes>,
}
impl BlockPointer {
fn new(
pointer: usize,
bl_extent: Extent4D<units::Bytes>,
extent: Extent4D<units::Bytes>,
) -> BlockPointer {
debug_assert!(bl_extent.array_len == 1);
debug_assert!(extent.width % bl_extent.width == 0);
debug_assert!(extent.height % bl_extent.height == 0);
debug_assert!(extent.depth % bl_extent.depth == 0);
debug_assert!(extent.array_len == 1);
BlockPointer {
pointer,
// We assume that offsets passed to at() are aligned to bl_extent so
//
// x_bl * bl_size_B
// = (x / bl_extent.width) * bl_size_B
// = x * (bl_size_B / bl_extent.width)
// = x * bl_extent.height * bl_extent.depth
x_mul: (bl_extent.height as usize) * (bl_extent.depth as usize),
// y_bl * width_bl * bl_size_B
// = (y / bl_extent.height) * width_bl * bl_size_B
// = y * (bl_size_B / bl_extent.height) * width_bl
// = y * bl_extent.width * bl_extent.depth * width_bl
// = y * (width_bl * bl_extent.width) * bl_extent.depth
// = y * extent.width * bl_extent.depth
y_mul: (extent.width as usize) * (bl_extent.depth as usize),
// z_bl * width_bl * height_bl * bl_size_B
// = (z / bl_extent.depth) * width_bl * height_bl * bl_size_B
// = z * (bl_size_B / bl_extent.depth) * width_bl * height_bl
// = z * (bl_extent.width * bl_extent.height) * width_bl * height_bl
// = z * width_bl * bl_extent.width * height_bl * bl_extent.height
// = z * extent.width * extent.height
z_mul: (extent.width as usize) * (extent.height as usize),
#[cfg(debug_assertions)]
bl_extent,
}
}
#[inline]
fn at(&self, offset: Offset4D<units::Bytes>) -> usize {
#[cfg(debug_assertions)]
{
debug_assert!(offset.x % self.bl_extent.width == 0);
debug_assert!(offset.y % self.bl_extent.height == 0);
debug_assert!(offset.z % self.bl_extent.depth == 0);
debug_assert!(offset.a == 0);
}
self.pointer
+ (offset.z as usize) * self.z_mul
+ (offset.y as usize) * self.y_mul
+ (offset.x as usize) * self.x_mul
}
}
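// A quick check of the multipliers (illustrative numbers): for 64x64x1-byte
// tiles in a 256x128x1-byte level, x_mul = 64, y_mul = 256, and
// z_mul = 32768, so at((64, 64, 0)) = 64*64 + 64*256 = 20480 = 5 * 4096B,
// i.e. tile 5 in block-linear order, as expected for tile (1, 1) with
// width_bl = 4.

/// A pointer into a linear layout with byte strides. The x_divisor (see
/// Copy16B::X_DIVISOR) scales tiled x offsets down when computing linear
/// addresses; it is 1 for the raw copies in this file.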
#[derive(Copy, Clone)]
struct LinearPointer {
pointer: usize,
x_shift: u32,
row_stride_B: usize,
plane_stride_B: usize,
}
impl LinearPointer {
fn new(
pointer: usize,
x_divisor: u32,
row_stride_B: usize,
plane_stride_B: usize,
) -> LinearPointer {
debug_assert!(x_divisor.is_power_of_two());
LinearPointer {
pointer,
x_shift: x_divisor.ilog2(),
row_stride_B,
plane_stride_B,
}
}
fn x_divisor(&self) -> u32 {
1 << self.x_shift
}
#[inline]
fn reverse(self, offset: Offset4D<units::Bytes>) -> LinearPointer {
debug_assert!(offset.x % (1 << self.x_shift) == 0);
debug_assert!(offset.a == 0);
LinearPointer {
pointer: self
.pointer
.wrapping_sub((offset.z as usize) * self.plane_stride_B)
.wrapping_sub((offset.y as usize) * self.row_stride_B)
.wrapping_sub((offset.x >> self.x_shift) as usize),
x_shift: self.x_shift,
row_stride_B: self.row_stride_B,
plane_stride_B: self.plane_stride_B,
}
}
#[inline]
fn at(self, offset: Offset4D<units::Bytes>) -> usize {
debug_assert!(offset.x % (1 << self.x_shift) == 0);
debug_assert!(offset.a == 0);
self.pointer
.wrapping_add((offset.z as usize) * self.plane_stride_B)
.wrapping_add((offset.y as usize) * self.row_stride_B)
.wrapping_add((offset.x >> self.x_shift) as usize)
}
#[inline]
fn offset(self, offset: Offset4D<units::Bytes>) -> LinearPointer {
LinearPointer {
pointer: self.at(offset),
x_shift: self.x_shift,
row_stride_B: self.row_stride_B,
plane_stride_B: self.plane_stride_B,
}
}
}
unsafe fn copy_tile<CG: CopyGOB>(
tiling: Tiling,
tile_ptr: usize,
linear: LinearPointer,
start: Offset4D<units::Bytes>,
end: Offset4D<units::Bytes>,
) {
debug_assert!(linear.x_divisor() == CG::X_DIVISOR);
debug_assert!(tiling.gob_type.extent_B() == CG::GOB_EXTENT_B);
let tile_extent_B = tiling.extent_B();
let tile_ptr = BlockPointer::new(tile_ptr, CG::GOB_EXTENT_B, tile_extent_B);
if start.is_aligned_to(CG::GOB_EXTENT_B)
&& end.is_aligned_to(CG::GOB_EXTENT_B)
{
for_each_extent4d_aligned(start, end, CG::GOB_EXTENT_B, |gob| {
CG::copy_whole_gob(tile_ptr.at(gob), linear.offset(gob));
});
} else {
for_each_extent4d(start, end, CG::GOB_EXTENT_B, |gob, start, end| {
let tiled = tile_ptr.at(gob);
let linear = linear.offset(gob);
if start == Offset4D::new(0, 0, 0, 0)
&& end == Offset4D::new(0, 0, 0, 0) + CG::GOB_EXTENT_B
{
CG::copy_whole_gob(tiled, linear);
} else {
CG::copy_gob(tiled, linear, start, end);
}
});
}
}
unsafe fn copy_tiled<CG: CopyGOB>(
tiling: Tiling,
level_extent_B: Extent4D<units::Bytes>,
level_tiled_ptr: usize,
linear: LinearPointer,
start: Offset4D<units::Bytes>,
end: Offset4D<units::Bytes>,
) {
let tile_extent_B = tiling.extent_B();
let level_extent_B = level_extent_B.align(&tile_extent_B);
// Back up the linear pointer so it also points at the start of the level.
// This way, every step of the iteration can assume that both pointers
// point to the start chunk of the level, tile, or GOB.
let linear = linear.reverse(start);
let level_tiled_ptr =
BlockPointer::new(level_tiled_ptr, tile_extent_B, level_extent_B);
for_each_extent4d(start, end, tile_extent_B, |tile, start, end| {
let tile_ptr = level_tiled_ptr.at(tile);
let linear = linear.offset(tile);
copy_tile::<CG>(tiling, tile_ptr, linear, start, end);
});
}
struct RawCopyToTiled {}
impl Copy16B for RawCopyToTiled {
const X_DIVISOR: u32 = 1;
unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize) {
// This is backwards from memcpy
std::ptr::copy_nonoverlapping(linear, tiled, bytes);
}
}
struct RawCopyToLinear {}
impl Copy16B for RawCopyToLinear {
const X_DIVISOR: u32 = 1;
unsafe fn copy(tiled: *mut u8, linear: *mut u8, bytes: usize) {
// This is backwards from memcpy
std::ptr::copy_nonoverlapping(tiled, linear, bytes);
}
}
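/// Copies a region of a linear image into a block-linear (tiled) image.
///
/// All offsets, extents, and strides are in bytes. level_extent_B is the
/// extent of the whole miplevel that tiled_dst points to; offset_B and
/// extent_B bound the region to copy within it.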
#[no_mangle]
pub unsafe extern "C" fn nil_copy_linear_to_tiled(
tiled_dst: *mut c_void,
level_extent_B: Extent4D<units::Bytes>,
linear_src: *const c_void,
linear_row_stride_B: usize,
linear_plane_stride_B: usize,
offset_B: Offset4D<units::Bytes>,
extent_B: Extent4D<units::Bytes>,
tiling: &Tiling,
) {
let end_B = offset_B + extent_B;
let linear_src = linear_src as usize;
let tiled_dst = tiled_dst as usize;
let linear_pointer = LinearPointer::new(
linear_src,
1,
linear_row_stride_B,
linear_plane_stride_B,
);
copy_tiled::<CopyGOBTuring2D<RawCopyToTiled>>(
*tiling,
level_extent_B,
tiled_dst,
linear_pointer,
offset_B,
end_B,
);
}
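/// Copies a region of a block-linear (tiled) image out to a linear image.
///
/// The parameters mirror nil_copy_linear_to_tiled(), with the linear image
/// as the destination.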
#[no_mangle]
pub unsafe extern "C" fn nil_copy_tiled_to_linear(
linear_dst: *mut c_void,
linear_row_stride_B: usize,
linear_plane_stride_B: usize,
tiled_src: *const c_void,
level_extent_B: Extent4D<units::Bytes>,
offset_B: Offset4D<units::Bytes>,
extent_B: Extent4D<units::Bytes>,
tiling: &Tiling,
) {
let mut end_B = offset_B + extent_B;
end_B.a = 1;
let linear_dst = linear_dst as usize;
let tiled_src = tiled_src as usize;
let linear_pointer = LinearPointer::new(
linear_dst,
1,
linear_row_stride_B,
linear_plane_stride_B,
);
copy_tiled::<CopyGOBTuring2D<RawCopyToLinear>>(
*tiling,
level_extent_B,
tiled_src,
linear_pointer,
offset_B,
end_B,
);
}

src/nouveau/nil/lib.rs

@@ -4,6 +4,7 @@
extern crate nil_rs_bindings;
extern crate nvidia_headers;
mod copy;
mod extent;
mod format;
mod image;