mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 04:48:08 +02:00
nak: Add surface address ops
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34975>
This commit is contained in:
parent
82d789d22a
commit
ac3fd5768b
4 changed files with 996 additions and 6 deletions
|
|
@ -2534,6 +2534,155 @@ impl<'a> ShaderFromNir<'a> {
|
|||
});
|
||||
self.set_dst(&intrin.def, dst);
|
||||
}
|
||||
nir_intrinsic_suclamp_nv => {
|
||||
let coords = self.get_src(&srcs[0]);
|
||||
let params = self.get_src(&srcs[1]);
|
||||
|
||||
let flags = intrin.flags();
|
||||
let flags: nak_nir_suclamp_flags =
|
||||
unsafe { std::mem::transmute_copy(&flags) };
|
||||
|
||||
let mode = match flags.mode() {
|
||||
NAK_SUCLAMP_MODE_BLOCK_LINEAR => SuClampMode::BlockLinear,
|
||||
NAK_SUCLAMP_MODE_PITCH_LINEAR => SuClampMode::PitchLinear,
|
||||
NAK_SUCLAMP_MODE_STORED_DESCRIPTOR => {
|
||||
SuClampMode::StoredInDescriptor
|
||||
}
|
||||
_ => panic!("Invalid suclamp mode"),
|
||||
};
|
||||
|
||||
let round = match flags.round() {
|
||||
NAK_SUCLAMP_ROUND_R1 => SuClampRound::R1,
|
||||
NAK_SUCLAMP_ROUND_R2 => SuClampRound::R2,
|
||||
NAK_SUCLAMP_ROUND_R4 => SuClampRound::R4,
|
||||
NAK_SUCLAMP_ROUND_R8 => SuClampRound::R8,
|
||||
NAK_SUCLAMP_ROUND_R16 => SuClampRound::R16,
|
||||
_ => panic!("Invalid suclamp round"),
|
||||
};
|
||||
|
||||
let dst = b.alloc_ssa(RegFile::GPR);
|
||||
let out_of_bounds = b.alloc_ssa(RegFile::Pred);
|
||||
b.push_op(OpSuClamp {
|
||||
dst: dst.into(),
|
||||
out_of_bounds: out_of_bounds.into(),
|
||||
coords,
|
||||
params,
|
||||
mode,
|
||||
round,
|
||||
is_2d: flags.is_2d(),
|
||||
is_s32: flags.is_s32(),
|
||||
imm: 0,
|
||||
});
|
||||
let final_dst =
|
||||
vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())];
|
||||
|
||||
self.set_ssa(&intrin.def, final_dst);
|
||||
}
|
||||
nir_intrinsic_subfm_nv => {
|
||||
let x = self.get_src(&srcs[0]);
|
||||
let y = self.get_src(&srcs[1]);
|
||||
let z = self.get_src(&srcs[2]);
|
||||
let is_3d = intrin.flags() != 0;
|
||||
|
||||
let dst = b.alloc_ssa(RegFile::GPR);
|
||||
let out_of_bounds = b.alloc_ssa(RegFile::Pred);
|
||||
b.push_op(OpSuBfm {
|
||||
dst: dst.into(),
|
||||
pdst: out_of_bounds.into(),
|
||||
srcs: [x, y, z],
|
||||
is_3d,
|
||||
});
|
||||
let final_dst =
|
||||
vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())];
|
||||
|
||||
self.set_ssa(&intrin.def, final_dst);
|
||||
}
|
||||
nir_intrinsic_sueau_nv => {
|
||||
let off = self.get_src(&srcs[0]);
|
||||
let bit_field = self.get_src(&srcs[1]);
|
||||
let addr = self.get_src(&srcs[2]);
|
||||
|
||||
let dst = b.alloc_ssa(RegFile::GPR);
|
||||
b.push_op(OpSuEau {
|
||||
dst: dst.into(),
|
||||
off,
|
||||
bit_field,
|
||||
addr,
|
||||
});
|
||||
self.set_dst(&intrin.def, dst.into());
|
||||
}
|
||||
nir_intrinsic_imadsp_nv => {
|
||||
let src0 = self.get_src(&srcs[0]);
|
||||
let src1 = self.get_src(&srcs[1]);
|
||||
let src2 = self.get_src(&srcs[2]);
|
||||
|
||||
let flags = intrin.flags();
|
||||
let flags: nak_nir_imadsp_flags =
|
||||
unsafe { std::mem::transmute_copy(&flags) };
|
||||
|
||||
let translate_src_type = |s| {
|
||||
use IMadSpSrcType::*;
|
||||
match s {
|
||||
NAK_IMAD_TYPE_U32 => U32,
|
||||
NAK_IMAD_TYPE_U24 => U24,
|
||||
NAK_IMAD_TYPE_U16_LO => U16Lo,
|
||||
NAK_IMAD_TYPE_U16_HI => U16Hi,
|
||||
NAK_IMAD_TYPE_S32 => S32,
|
||||
NAK_IMAD_TYPE_S24 => S24,
|
||||
NAK_IMAD_TYPE_S16_LO => S16Lo,
|
||||
NAK_IMAD_TYPE_S16_HI => S16Hi,
|
||||
_ => panic!("Invalid imadsp mode"),
|
||||
}
|
||||
};
|
||||
|
||||
let mode = if flags.params_from_src1() {
|
||||
IMadSpMode::FromSrc1
|
||||
} else {
|
||||
IMadSpMode::Explicit([
|
||||
translate_src_type(flags.src0()),
|
||||
translate_src_type(flags.src1()),
|
||||
translate_src_type(flags.src2()),
|
||||
])
|
||||
};
|
||||
|
||||
let dst = b.alloc_ssa(RegFile::GPR);
|
||||
b.push_op(OpIMadSp {
|
||||
srcs: [src0, src1, src2],
|
||||
dst: dst.into(),
|
||||
mode,
|
||||
});
|
||||
self.set_dst(&intrin.def, dst.into());
|
||||
}
|
||||
nir_intrinsic_suldga_nv => {
|
||||
let addr = self.get_src(&srcs[0]);
|
||||
let format = self.get_src(&srcs[1]);
|
||||
let out_of_bounds = self.get_src(&srcs[2]);
|
||||
|
||||
let comps = intrin.num_components;
|
||||
|
||||
assert!(intrin.def.bit_size() == 32);
|
||||
let mem_type = self.get_image_mem_type(intrin);
|
||||
|
||||
let flags = intrin.flags();
|
||||
let offset_mode = match flags {
|
||||
NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32,
|
||||
NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32,
|
||||
NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8,
|
||||
NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8,
|
||||
_ => panic!("Invalid suldga flags"),
|
||||
};
|
||||
|
||||
let dst = b.alloc_ssa_vec(RegFile::GPR, comps);
|
||||
b.push_op(OpSuLdGa {
|
||||
dst: dst.clone().into(),
|
||||
addr,
|
||||
format,
|
||||
out_of_bounds,
|
||||
mem_type,
|
||||
offset_mode,
|
||||
});
|
||||
self.set_dst(&intrin.def, dst);
|
||||
}
|
||||
nir_intrinsic_bindless_image_load
|
||||
| nir_intrinsic_bindless_image_load_raw_nv => {
|
||||
let handle = self.get_src(&srcs[0]);
|
||||
|
|
@ -2624,6 +2773,33 @@ impl<'a> ShaderFromNir<'a> {
|
|||
|
||||
self.set_ssa(&intrin.def, final_dst);
|
||||
}
|
||||
nir_intrinsic_sustga_nv => {
|
||||
let addr = self.get_src(&srcs[0]);
|
||||
let format = self.get_src(&srcs[1]);
|
||||
let out_of_bounds = self.get_src(&srcs[2]);
|
||||
|
||||
let data = self.get_src(&srcs[3]);
|
||||
let image_access =
|
||||
ImageAccess::Formatted(ChannelMask::new(0xf));
|
||||
|
||||
let flags = intrin.flags();
|
||||
let offset_mode = match flags {
|
||||
NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32,
|
||||
NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32,
|
||||
NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8,
|
||||
NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8,
|
||||
_ => panic!("Invalid sustga flags"),
|
||||
};
|
||||
|
||||
b.push_op(OpSuStGa {
|
||||
addr,
|
||||
format,
|
||||
data,
|
||||
out_of_bounds,
|
||||
image_access,
|
||||
offset_mode,
|
||||
});
|
||||
}
|
||||
nir_intrinsic_bindless_image_store => {
|
||||
let handle = self.get_src(&srcs[0]);
|
||||
let dim = self.get_image_dim(intrin);
|
||||
|
|
|
|||
|
|
@ -1180,6 +1180,126 @@ fn test_iadd64() {
|
|||
}
|
||||
}
|
||||
|
||||
#[test]
fn test_op_suclamp() {
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }

    // Exhaustively testing every option combination is infeasible;
    // pick the rounding mode and immediate pseudo-randomly instead.
    let mut rng = Acorn::new();
    let modes = [
        SuClampMode::StoredInDescriptor,
        SuClampMode::PitchLinear,
        SuClampMode::BlockLinear,
    ];
    for mode in modes {
        for is_2d in [false, true] {
            for is_s32 in [false, true] {
                // The immediate is a signed 6-bit value: -32..=31.
                let imm = (rng.get_u32() % 64) as i8 - 32;
                let round = match rng.get_u32() % 5 {
                    0 => SuClampRound::R1,
                    1 => SuClampRound::R2,
                    2 => SuClampRound::R4,
                    3 => SuClampRound::R8,
                    _ => SuClampRound::R16,
                };

                test_foldable_op(OpSuClamp {
                    dst: Dst::None,
                    out_of_bounds: Dst::None,
                    mode,
                    round,
                    is_s32,
                    is_2d,
                    coords: 0.into(),
                    params: 0.into(),
                    imm,
                });
            }
        }
    }
}
|
||||
|
||||
#[test]
fn test_op_subfm() {
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }

    // Exercise both the 2D and 3D variants of the bit-field merge.
    for is_3d in [false, true] {
        test_foldable_op(OpSuBfm {
            dst: Dst::None,
            pdst: Dst::None,
            srcs: [0.into(), 0.into(), 0.into()],
            is_3d,
        });
    }
}
|
||||
|
||||
#[test]
fn test_op_sueau() {
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }

    // sueau takes no modifiers, so a single case covers it.
    test_foldable_op(OpSuEau {
        dst: Dst::None,
        off: 0.into(),
        bit_field: 0.into(),
        addr: 0.into(),
    });
}
|
||||
|
||||
#[test]
fn test_op_imadsp() {
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }

    use IMadSpSrcType::*;
    // Hardware limits: src1 is never U32/U16Hi, src2 is never U16Hi,
    // and src2's signedness follows src0/src1.
    let widths0 = [U32, U24, U16Lo, U16Hi];
    let widths1 = [U24, U16Lo];
    let widths2 = [U32, U24, U16Lo];

    let mut run = |mode| {
        test_foldable_op(OpIMadSp {
            dst: Dst::None,
            srcs: [0.into(), 0.into(), 0.into()],
            mode,
        });
    };

    // Cartesian product of widths and sign combinations.
    for w0 in widths0 {
        for w1 in widths1 {
            for w2 in widths2 {
                for sign_bits in 0..4 {
                    let s0 = (sign_bits & 0x1) != 0;
                    let s1 = (sign_bits & 0x2) != 0;
                    // src2 must be signed if either input is signed.
                    let s2 = s0 || s1;
                    run(IMadSpMode::Explicit([
                        w0.with_sign(s0),
                        w1.with_sign(s1),
                        w2.with_sign(s2),
                    ]));
                }
            }
        }
    }
    run(IMadSpMode::FromSrc1);
}
|
||||
|
||||
#[test]
|
||||
fn test_ineg64() {
|
||||
let run = RunSingleton::get();
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
extern crate bitview;
|
||||
extern crate nak_ir_proc;
|
||||
|
||||
use bitview::{BitMutView, BitView};
|
||||
use bitview::{BitMutView, BitMutViewable, BitView, BitViewable, SetField};
|
||||
use nak_bindings::*;
|
||||
|
||||
pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder};
|
||||
|
|
@ -5183,6 +5183,669 @@ impl DisplayOp for OpSuAtom {
|
|||
}
|
||||
impl_display_for_op!(OpSuAtom);
|
||||
|
||||
/// Addressing-layout mode used by [`OpSuClamp`].
#[derive(Clone, Copy)]
pub enum SuClampMode {
    /// The layout is read at runtime from bit 21 of the params word.
    StoredInDescriptor,
    PitchLinear,
    BlockLinear,
}
|
||||
|
||||
impl fmt::Display for SuClampMode {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let s = match self {
|
||||
SuClampMode::StoredInDescriptor => ".sd",
|
||||
SuClampMode::PitchLinear => ".pl",
|
||||
SuClampMode::BlockLinear => ".bl",
|
||||
};
|
||||
write!(f, "{}", s)
|
||||
}
|
||||
}
|
||||
|
||||
/// Power-of-two granularity that [`OpSuClamp`] rounds the bound down to.
#[derive(Clone, Copy)]
pub enum SuClampRound {
    R1,
    R2,
    R4,
    R8,
    R16,
}
|
||||
|
||||
impl SuClampRound {
|
||||
pub fn to_int(&self) -> u8 {
|
||||
match self {
|
||||
SuClampRound::R1 => 1,
|
||||
SuClampRound::R2 => 2,
|
||||
SuClampRound::R4 => 4,
|
||||
SuClampRound::R8 => 8,
|
||||
SuClampRound::R16 => 16,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_mask(&self) -> u32 {
|
||||
!(self.to_int() as u32 - 1)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SuClampRound {
    // Printed as the rounding granularity, e.g. ".r4".
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, ".r{}", self.to_int())
    }
}
|
||||
|
||||
/// Kepler only
/// Surface Clamp
///
/// Can clamp coordinates of surface operations in a 0..=clamp inclusive
/// range. It also computes other information useful to compute the
/// real address of an element within an image for both block-linear and
/// pitch-linear layouts. We can also reduce this operation to a "stupid"
/// inclusive clamp by setting modifier Mode=PitchLinear and is_2d=false;
/// this will not compute any extra operations and is useful to clamp array
/// indexes.
///
/// Since the shader code does not know if an image layout is block-linear
/// or pitch-linear, this opcode must be able to do both, the operation
/// is then selected by the "clamp" bitfield, usually read from a descriptor.
/// In block-linear mode we divide the bits that will compute the higher
/// part and the lower part.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuClamp {
    /// Clamped coordinate plus layout-information bits.
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Set when the input coordinate was outside the 0..=bound range.
    #[dst_type(Pred)]
    pub out_of_bounds: Dst,

    /// This modifier specifies if we use pitch-linear or block-linear
    /// calculations, another option is to support both and read the actual
    /// format from the clamp (shader code doesn't always know if an image
    /// layout).
    /// When mode=pitch_linear and is_2d=false the suclamp op enters a
    /// simpler "plain" mode where it only performs clamping and the output
    /// register doesn't contain any information bits about pitch-linear or
    /// block-linear calculations
    pub mode: SuClampMode,
    /// Strangely enough, "round" just rounds the clamp, not the source
    /// this does not help at all with clamping coordinates.
    /// It could be useful when clamping raw addresses of a multi-byte read.
    /// ex: if we read 4 bytes at once, and the buffer length is 16,
    /// the bounds will be 15 (they are inclusive), but if we read
    /// at address 15 we would read bytes 15..19, so we are out of range.
    /// if we clamp the bounds to R4 the effective bound becomes 12
    /// so the read will be performed from 12..16, remaining in bounds.
    pub round: SuClampRound,
    /// Treat the coordinate as signed 32-bit (otherwise unsigned).
    pub is_s32: bool,
    pub is_2d: bool,

    #[src_type(GPR)]
    pub coords: Src,

    /// Packed parameter containing both bounds (inclusive)
    /// and other information (explained in more details in Foldable):
    /// 0..20: bound (inclusive)
    /// 21: pitch_linear (used if mode == StoredInDescriptor)
    /// 22..26: coord shl
    /// 26..29: coord shr
    /// 29..32: n. of tiles
    #[src_type(ALU)]
    pub params: Src,
    /// Added to the coords, it's only an i6
    pub imm: i8,
}
|
||||
|
||||
impl Foldable for OpSuClamp {
    /// Software model of suclamp used for constant folding; mirrors the
    /// output bit layout described on [`OpSuClamp`].
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let src = f.get_u32_src(self, &self.coords);
        let params = f.get_u32_src(self, &self.params);
        let imm = self.imm; // i6

        // Widen to i64 so the immediate add below cannot wrap.
        let src = if self.is_s32 {
            (src as i32) as i64
        } else {
            src as i64
        };
        let src = src + (imm as i64);

        let params_bv = BitView::new(&params);
        let pitch_linear = match self.mode {
            SuClampMode::StoredInDescriptor => params_bv.get_bit(21),
            SuClampMode::PitchLinear => true,
            SuClampMode::BlockLinear => false,
        };

        // In the "plain" 1D pitch-linear mode the whole params word is
        // the bound; otherwise only bits 0..20 are.
        let bounds = if pitch_linear && !self.is_2d {
            params
        } else {
            params_bv.get_bit_range_u64(0..20) as u32
        };

        let bounds = bounds & self.round.to_mask();
        let (is_oob, clamped) = if src < 0 {
            (true, 0)
        } else if src > (bounds as i64) {
            (true, bounds)
        } else {
            (false, src as u32)
        };

        let mut out = 0u32;
        let mut bv = BitMutView::new(&mut out);
        if pitch_linear {
            if !self.is_2d {
                // simple clamp mode, NO BITFIELD
                bv.set_field(0..32, clamped);
            } else {
                // Real, pitch_linear mode
                bv.set_field(0..20, clamped & 0xfffff);

                // Pass through el_size_log2
                bv.set_field(27..30, params_bv.get_bit_range_u64(26..29));
                bv.set_bit(30, true); // pitch_linear=true
                bv.set_bit(31, is_oob);
            }
        } else {
            // Block linear

            // Number of bits to discard for GoB coordinates
            let shr_a = params_bv.get_bit_range_u64(22..26) as u8;
            // Block coords
            bv.set_field(0..16, (clamped >> shr_a) & 0xffff);

            // Shift applied to coords, always zero except for x.
            // (for coord x=1 and format R32, we want to access byte 4)
            // e.g. R8 -> 0, R32 -> 2, 128 -> 4
            let el_size_log2 = params_bv.get_bit_range_u64(26..29) as u8;
            // Coord inside GoB (element space)
            bv.set_field(16..24, (clamped << el_size_log2) & 0xff);

            // Useful later to compute gob-space coords.
            let n_tiles = params_bv.get_bit_range_u64(29..32) as u8;
            bv.set_field(27..30, n_tiles);
            bv.set_bit(30, false); // pitch_linear=false
            bv.set_bit(31, is_oob);
        }
        f.set_u32_dst(self, &self.dst, out);
        f.set_pred_dst(self, &self.out_of_bounds, is_oob);
    }
}
|
||||
|
||||
impl DisplayOp for OpSuClamp {
|
||||
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "suclamp{}", self.mode)?;
|
||||
if !matches!(self.round, SuClampRound::R1) {
|
||||
write!(f, "{}", self.round)?;
|
||||
}
|
||||
if !self.is_s32 {
|
||||
write!(f, ".u32")?;
|
||||
}
|
||||
if !self.is_2d {
|
||||
write!(f, ".1d")?;
|
||||
}
|
||||
|
||||
write! {f, " {} {} {:x}", self.coords, self.params, self.imm}
|
||||
}
|
||||
}
|
||||
impl_display_for_op!(OpSuClamp);
|
||||
|
||||
/// Kepler only
/// BitField Merge
///
/// The resulting bit-field is composed of a high-part 8..32 that is merged
/// with the address by sueau, and a lower-part 0..8 that is provided
/// directly to suldga/sustga and defines the lower offset of the global array.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuBfm {
    /// Merged bit-field (see Foldable impl for the exact layout).
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Out-of-bounds predicate, ORed from the srcs' bit 31.
    #[dst_type(Pred)]
    pub pdst: Dst,

    /// x, y, z
    #[src_type(ALU)]
    pub srcs: [Src; 3],
    /// When is_3d=false the third source is ignored, but still used in
    /// pitch-linear computation.
    pub is_3d: bool,
}
|
||||
|
||||
impl Foldable for OpSuBfm {
    /// Software model of subfm used for constant folding. Inputs are the
    /// per-axis outputs of suclamp (see [`OpSuClamp`]'s Foldable impl for
    /// their bit layout).
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let x_raw = f.get_u32_src(self, &self.srcs[0]);
        let y_raw = f.get_u32_src(self, &self.srcs[1]);
        let z_raw = f.get_u32_src(self, &self.srcs[2]);

        let x = BitView::new(&x_raw);
        let y = BitView::new(&y_raw);
        let z = BitView::new(&z_raw);

        let mut o_raw = 0u32;
        let mut o = BitMutView::new(&mut o_raw);

        // Bit 30 of a suclamp result is its pitch_linear flag.
        let is_pitch_linear_2d = x.get_bit(30) || y.get_bit(30);

        if !is_pitch_linear_2d {
            // Copy coordinates inside of GoB space.
            // They are 6 bits from x and 3 from y (GoB is 64x8 bytes).
            // Bits from 0..8 are ignored by sueau and are used directly
            // by suldga/sustga.
            // Bit 9 will become the first bit of the higher part in
            // sueau.
            o.set_bit_range_u64(0..4, x.get_bit_range_u64(16..20));

            // Address calculation inside of GoB should virtually be
            // y * 64 + x * element_size (each row is linear).
            // So why are those bits swizzled like so?
            // I have no idea, but these are correct even for atomics
            // that accept real addresses.
            o.set_bit(4, y.get_bit(16));
            o.set_bit(5, y.get_bit(17));
            o.set_bit(6, x.get_bit(20));
            o.set_bit(7, y.get_bit(18));

            o.set_bit(8, x.get_bit(21));
            // 9..11: 0

            // -------------- Tiles --------------
            // Number of tiles log2
            let ntx = x.get_bit_range_u64(27..30) & 0x1;
            let nty = y.get_bit_range_u64(27..30);
            let ntz = z.get_bit_range_u64(27..30);
            let ntz = ntz * (self.is_3d as u64); // z is ignored if is_3d=false

            // Computes how many bits to dedicate to GoB coords inside
            // a block
            o.set_field(12..16, ntx + nty + ntz);

            // Coords in gob_space.
            // Remove 6 bits from x and 3 bits from y, those are used
            // as element coords in GoB space.
            let a = x.get_bit_range_u64(22..24); // 1100_0000
            let b = y.get_bit_range_u64(19..24); // 1111_1000
            let c = z.get_bit_range_u64(16..24); // 1111_1111

            // nt* indicates how many bits to consider (max 5)
            let a = a & ((1 << ntx) - 1);
            let b = b & ((1 << nty.min(5)) - 1);
            let c = c & ((1 << ntz.min(5)) - 1);

            // Compute gob offset
            // We can just or together at certain offsets because
            // Tiles are always powers of two in each direction.
            // z || y || x (LSB)
            let res = c;
            let res = (res << nty) | b;
            let res = (res << ntx) | a;
            let mask = match ntx {
                0 => 0x3ff,
                _ => 0x7ff,
            };

            // gob coords will be put before the block coords in
            // sueau.
            o.set_field(16..27, res & mask);
        } else {
            let d = z.get_bit_range_u64(0..8);
            let el_size_log2 = x.get_bit_range_u64(27..30);
            o.set_field(0..8, (d << el_size_log2) & 0xff);
            // 9..11: 0
            o.set_field(12..15, el_size_log2);
        }

        o.set_bit(11, is_pitch_linear_2d);

        // Bit 31 of each suclamp result is its out-of-bounds flag.
        let is_oob =
            x.get_bit(31) || y.get_bit(31) || (z.get_bit(31) && self.is_3d);
        f.set_u32_dst(self, &self.dst, o_raw);
        f.set_pred_dst(self, &self.pdst, is_oob);
    }
}
|
||||
|
||||
impl DisplayOp for OpSuBfm {
|
||||
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "subfm")?;
|
||||
|
||||
if self.is_3d {
|
||||
write!(f, ".3d")?;
|
||||
}
|
||||
|
||||
write!(f, " {} {} {}", self.srcs[0], self.srcs[1], self.srcs[2])
|
||||
}
|
||||
}
|
||||
impl_display_for_op!(OpSuBfm);
|
||||
|
||||
/// Kepler only
/// Used to compute the higher 32 bits of image address using
/// the merged bitfield and the block coordinates (offset).
/// It can switch to a pitch_linear mode (bit 11 of bit-field).
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuEau {
    /// Higher part of the element address.
    #[dst_type(GPR)]
    pub dst: Dst,

    /// offset is computed from the block coordinates.
    /// it's ok to add it directly to the address since they are both
    /// "aligned" to 64 (the first 8 bits are removed from both)
    #[src_type(GPR)]
    pub off: Src,

    /// 8.. 9: offset, last bit
    /// 11..12: pitch_linear: when enabled the bf-offset is ignored and
    ///         the off_shl is subtracted by 8
    /// 12..16: off_shl, shifts left the offset by off_shl + 1
    /// 16..27: 11-bit offset, when joined with the 1-bit offset completes the
    ///         12-bit offset ORed to the src offset after shifting
    ///         (unless pitch_linear)
    #[src_type(ALU)]
    pub bit_field: Src,

    /// Base address (already shifted right by 8).
    #[src_type(GPR)]
    pub addr: Src,
}
|
||||
|
||||
impl Foldable for OpSuEau {
    /// Software model of sueau used for constant folding. Decodes the
    /// bit_field produced by subfm and combines it with off and addr.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let off_raw = f.get_u32_src(self, &self.off);
        let bf_raw = f.get_u32_src(self, &self.bit_field);
        let addr = f.get_u32_src(self, &self.addr);

        let bf = BitView::new(&bf_raw);

        // Field layout documented on OpSuEau::bit_field.
        let off1 = bf.get_bit_range_u64(8..9) as u32;
        let is_pitch_linear = bf.get_bit(11);
        let off_shift = bf.get_bit_range_u64(12..16) as u32;
        let offs = bf.get_bit_range_u64(16..27) as u32;

        let res = if !is_pitch_linear {
            // Block linear
            // off_raw are the block coordinates
            // to those we add gob coordinates from the merged bitfield
            // and the MSB of in-gob coordinates.
            let omul = off_shift + 1;
            let real_off = (off_raw << omul) | (offs << 1) | off1;
            addr.wrapping_add(real_off & 0x7ff_ffff)
        } else {
            // Add the high part of the coordinates to addr
            // off << (omul - 8)
            // but for negative values do a shr instead.
            // In fact, off_shift will always be < 8 because pitch_linear
            // subfm only assigns bits 12..15, so this is always a shr
            let shl_amount = off_shift as i32 - 8;
            let off = if shl_amount < 0 {
                off_raw >> (-shl_amount as u32)
            } else {
                off_raw << (shl_amount as u32)
            };
            addr.wrapping_add(off & 0xff_ffff)
        };
        f.set_u32_dst(self, &self.dst, res);
    }
}
|
||||
|
||||
impl DisplayOp for OpSuEau {
|
||||
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write! {f, "sueau {} {} {}", self.off, self.bit_field, self.addr}
|
||||
}
|
||||
}
|
||||
impl_display_for_op!(OpSuEau);
|
||||
|
||||
/// How [`OpIMadSp`] extracts a source operand from its 32-bit register:
/// which bits are taken and whether they are sign-extended.
#[derive(Copy, Clone, Debug)]
pub enum IMadSpSrcType {
    U32,
    U24,
    U16Hi,
    U16Lo,
    S32,
    S24,
    S16Hi,
    S16Lo,
}
|
||||
|
||||
impl IMadSpSrcType {
    /// Returns the unsigned counterpart of this type (identity if already
    /// unsigned).
    pub fn unsigned(self) -> IMadSpSrcType {
        use IMadSpSrcType::*;
        match self {
            S32 => U32,
            S24 => U24,
            S16Hi => U16Hi,
            S16Lo => U16Lo,
            x => x,
        }
    }

    /// Returns this type with the requested signedness, keeping the width.
    #[allow(dead_code)] // Used in hw_tests
    pub fn with_sign(self, sign: bool) -> Self {
        use IMadSpSrcType::*;
        if !sign {
            return self.unsigned();
        }
        match self {
            U32 => S32,
            U24 => S24,
            U16Hi => S16Hi,
            U16Lo => S16Lo,
            x => x,
        }
    }

    /// Whether this type sign-extends its extracted bits.
    pub fn sign(self) -> bool {
        use IMadSpSrcType::*;
        match self {
            U32 | U24 | U16Hi | U16Lo => false,
            S32 | S24 | S16Hi | S16Lo => true,
        }
    }

    /// Extracts this type's bits from a raw 32-bit value and widens to i64
    /// (sign-extending for the signed variants).
    fn cast(&self, v: u32) -> i64 {
        use IMadSpSrcType::*;
        match self {
            U32 => v as i64,
            U24 => (v & 0x00ff_ffff) as i64,
            U16Lo => (v as u16) as i64,
            U16Hi => (v >> 16) as i64,
            S32 => (v as i32) as i64,
            S24 => (((v as i32) << 8) >> 8) as i64, // Sign extend
            S16Lo => (v as i16) as i64,
            S16Hi => ((v >> 16) as i16) as i64,
        }
    }
}
|
||||
|
||||
impl fmt::Display for IMadSpSrcType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let sign = if self.sign() { ".s" } else { ".u" };
|
||||
let width = match self.unsigned() {
|
||||
IMadSpSrcType::U32 => "32",
|
||||
IMadSpSrcType::U24 => "24",
|
||||
IMadSpSrcType::U16Lo => "16h0",
|
||||
IMadSpSrcType::U16Hi => "16h1",
|
||||
_ => unreachable!(),
|
||||
};
|
||||
write!(f, "{}{}", sign, width)
|
||||
}
|
||||
}
|
||||
|
||||
/// How [`OpIMadSp`] obtains the three source types.
#[derive(Clone, Copy, Debug)]
pub enum IMadSpMode {
    /// One source type per source, fixed at compile time.
    Explicit([IMadSpSrcType; 3]),
    // Parameters are loaded from src1 bits 26..32
    FromSrc1,
}
|
||||
|
||||
impl fmt::Display for IMadSpMode {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
IMadSpMode::Explicit([a, b, c]) => write!(f, "{a}{b}{c}"),
|
||||
IMadSpMode::FromSrc1 => write!(f, ".sd"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Kepler only
/// Extracted Integer Multiply and Add.
/// It does the same operation as an imad op, but it can extract the
/// sources from a subset of the register (only 32, 24 or 16 bits).
/// It can also do a "load parameters" mode where the modifiers are
/// loaded from the higher bits in src2 (check Foldable impl for details).
/// Limits: src1 can never be U32 or U16Hi,
///         src2 can never be U16Hi
///         src2 signedness is tied to src1 and src0 signedness,
///         if either is signed, src2 must be signed too.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpIMadSp {
    /// Low 32 bits of srcs[0] * srcs[1] + srcs[2].
    #[dst_type(GPR)]
    pub dst: Dst,

    #[src_type(ALU)]
    pub srcs: [Src; 3],

    /// How the source types are determined (explicit or from src1).
    pub mode: IMadSpMode,
}
|
||||
|
||||
impl Foldable for OpIMadSp {
    /// Software model of imadsp used for constant folding.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let src0 = f.get_u32_src(self, &self.srcs[0]);
        let src1 = f.get_u32_src(self, &self.srcs[1]);
        let src2 = f.get_u32_src(self, &self.srcs[2]);

        let (src_type0, src_type1, src_type2) = match self.mode {
            IMadSpMode::Explicit([t0, t1, t2]) => (t0, t1, t2),
            IMadSpMode::FromSrc1 => {
                // Type selectors live in the top bits of src1.
                let params = BitView::new(&src1);

                let st2 = params.get_bit_range_u64(26..28) as usize;
                let st1 = params.get_bit_range_u64(28..30) as usize;
                let st0 = params.get_bit_range_u64(30..32) as usize;

                use IMadSpSrcType::*;
                // Per-source selector tables (2-bit index each).
                let types0 = [U32, U24, U16Lo, U16Hi];
                let types1 = [U16Lo, U24, U16Lo, U24];
                let types2 = [U32, U24, U16Lo, U32];

                (
                    types0[st0].unsigned(),
                    types1[st1].unsigned(),
                    types2[st2].unsigned(),
                )
            }
        };

        let src0 = src_type0.cast(src0);
        let src1 = src_type1.cast(src1);
        let src2 = src_type2.cast(src2);

        // Compute in i64 and truncate to the low 32 bits.
        f.set_u32_dst(self, &self.dst, (src0 * src1 + src2) as u32);
    }
}
|
||||
|
||||
impl DisplayOp for OpIMadSp {
|
||||
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"imadsp{} {} {} {}",
|
||||
self.mode, self.srcs[0], self.srcs[1], self.srcs[2]
|
||||
)
|
||||
}
|
||||
}
|
||||
impl_display_for_op!(OpIMadSp);
|
||||
|
||||
/// In SuGa ops, the address is always specified in two parts, the higher
/// part contains the base address without the lower 8 bits (base_addr >> 8),
/// while the lower part might contain either the missing 8 bits (U8) or
/// a full 32-bit offset that must not be shifted (U32).
///
/// In short:
/// U8 : real_address = (addr_hi << 8) + (addr_lo & 0xFF)
/// U32: real_address = (addr_hi << 8) + addr_lo
/// The signed variants do the same but with sign extension probably
#[derive(Clone, Copy)]
pub enum SuGaOffsetMode {
    U32,
    S32,
    U8,
    S8,
}
|
||||
|
||||
/// Kepler only
/// Load a pixel from an image, takes the pixel address and format as an
/// argument. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true it always loads
/// (0, 0, 0, 1)
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuLdGa {
    pub dst: Dst,

    pub mem_type: MemType,
    /// How the low address part is combined with the high part.
    pub offset_mode: SuGaOffsetMode,

    /// Format for the loaded data, passed directly from the descriptor.
    #[src_type(GPR)]
    pub format: Src,

    /// This is not an address, but it's two registers that contain
    /// [addr >> 8, addr & 0xff].
    /// This works because addr >> 8 is 32-bits (GOB-aligned) and the
    /// rest 8-bits are extracted by the bit-field
    /// It's useful since in block-linear mode the lower bits and the higher
    /// bits are computed in different ways.
    #[src_type(SSA)]
    pub addr: Src,

    #[src_type(Pred)]
    pub out_of_bounds: Src,
}
|
||||
|
||||
impl DisplayOp for OpSuLdGa {
|
||||
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"suldga{} [{}] {} {}",
|
||||
self.mem_type, self.addr, self.format, self.out_of_bounds
|
||||
)
|
||||
}
|
||||
}
|
||||
impl_display_for_op!(OpSuLdGa);
|
||||
|
||||
/// Kepler only
/// Store a pixel in an image, takes the pixel address and format as an
/// argument. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true, stores are ignored
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuStGa {
    pub image_access: ImageAccess,
    /// How the low address part is combined with the high part.
    pub offset_mode: SuGaOffsetMode,

    /// Format for the stored data, passed directly from the descriptor.
    #[src_type(GPR)]
    pub format: Src,

    /// Two registers holding [addr >> 8, addr & 0xff], as in OpSuLdGa.
    #[src_type(SSA)]
    pub addr: Src,

    #[src_type(SSA)]
    pub data: Src,

    #[src_type(Pred)]
    pub out_of_bounds: Src,
}
|
||||
|
||||
impl DisplayOp for OpSuStGa {
|
||||
fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"sustga{} [{}] {} {} {}",
|
||||
self.image_access,
|
||||
self.addr,
|
||||
self.format,
|
||||
self.data,
|
||||
self.out_of_bounds,
|
||||
)
|
||||
}
|
||||
}
|
||||
impl_display_for_op!(OpSuStGa);
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(SrcsAsSlice, DstsAsSlice)]
|
||||
pub struct OpLd {
|
||||
|
|
@ -6676,6 +7339,12 @@ pub enum Op {
|
|||
SuLd(OpSuLd),
|
||||
SuSt(OpSuSt),
|
||||
SuAtom(OpSuAtom),
|
||||
SuClamp(OpSuClamp),
|
||||
SuBfm(OpSuBfm),
|
||||
SuEau(OpSuEau),
|
||||
IMadSp(OpIMadSp),
|
||||
SuLdGa(OpSuLdGa),
|
||||
SuStGa(OpSuStGa),
|
||||
Ld(OpLd),
|
||||
Ldc(OpLdc),
|
||||
LdSharedLock(OpLdSharedLock),
|
||||
|
|
@ -6805,6 +7474,10 @@ impl Op {
|
|||
| Op::LeaX(_)
|
||||
| Op::Lop2(_)
|
||||
| Op::Lop3(_)
|
||||
| Op::SuClamp(_)
|
||||
| Op::SuBfm(_)
|
||||
| Op::SuEau(_)
|
||||
| Op::IMadSp(_)
|
||||
| Op::Shf(_)
|
||||
| Op::Shl(_)
|
||||
| Op::Shr(_)
|
||||
|
|
@ -6834,7 +7507,11 @@ impl Op {
|
|||
| Op::Txq(_) => false,
|
||||
|
||||
// Surface ops
|
||||
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => false,
|
||||
Op::SuLd(_)
|
||||
| Op::SuSt(_)
|
||||
| Op::SuAtom(_)
|
||||
| Op::SuLdGa(_)
|
||||
| Op::SuStGa(_) => false,
|
||||
|
||||
// Memory ops
|
||||
Op::Ld(_)
|
||||
|
|
@ -7232,7 +7909,11 @@ impl Instr {
|
|||
Op::Atom(op) => op.mem_space != MemSpace::Local,
|
||||
Op::Ld(op) => op.access.space != MemSpace::Local,
|
||||
Op::St(op) => op.access.space != MemSpace::Local,
|
||||
Op::SuAtom(_) | Op::SuLd(_) | Op::SuSt(_) => true,
|
||||
Op::SuAtom(_)
|
||||
| Op::SuLd(_)
|
||||
| Op::SuSt(_)
|
||||
| Op::SuLdGa(_)
|
||||
| Op::SuStGa(_) => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
|
@ -7241,7 +7922,7 @@ impl Instr {
|
|||
match &self.op {
|
||||
Op::Atom(op) => matches!(op.mem_space, MemSpace::Global(_)),
|
||||
Op::St(op) => matches!(op.access.space, MemSpace::Global(_)),
|
||||
Op::SuAtom(_) | Op::SuSt(_) => true,
|
||||
Op::SuAtom(_) | Op::SuSt(_) | Op::SuStGa(_) => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
|
@ -7250,6 +7931,7 @@ impl Instr {
|
|||
match &self.op {
|
||||
Op::ASt(_)
|
||||
| Op::SuSt(_)
|
||||
| Op::SuStGa(_)
|
||||
| Op::SuAtom(_)
|
||||
| Op::LdSharedLock(_)
|
||||
| Op::St(_)
|
||||
|
|
|
|||
|
|
@ -129,6 +129,10 @@ pub fn side_effect_type(op: &Op) -> SideEffect {
|
|||
| Op::LeaX(_)
|
||||
| Op::Lop2(_)
|
||||
| Op::Lop3(_)
|
||||
| Op::SuClamp(_)
|
||||
| Op::SuBfm(_)
|
||||
| Op::SuEau(_)
|
||||
| Op::IMadSp(_)
|
||||
| Op::Shf(_)
|
||||
| Op::Shl(_)
|
||||
| Op::Shr(_)
|
||||
|
|
@ -158,7 +162,11 @@ pub fn side_effect_type(op: &Op) -> SideEffect {
|
|||
| Op::Txq(_) => SideEffect::Memory,
|
||||
|
||||
// Surface ops
|
||||
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => SideEffect::Memory,
|
||||
Op::SuLd(_)
|
||||
| Op::SuSt(_)
|
||||
| Op::SuAtom(_)
|
||||
| Op::SuLdGa(_)
|
||||
| Op::SuStGa(_) => SideEffect::Memory,
|
||||
|
||||
// Memory ops
|
||||
Op::Ipa(_) | Op::Ldc(_) => SideEffect::None,
|
||||
|
|
@ -262,7 +270,11 @@ pub fn estimate_variable_latency(sm: u8, op: &Op) -> u32 {
|
|||
| Op::Txq(_) => 32,
|
||||
|
||||
// Surface ops
|
||||
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => 32,
|
||||
Op::SuLd(_)
|
||||
| Op::SuSt(_)
|
||||
| Op::SuAtom(_)
|
||||
| Op::SuLdGa(_)
|
||||
| Op::SuStGa(_) => 32,
|
||||
|
||||
// Memory ops
|
||||
Op::Ldc(_) => 4,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue