nak: Add surface address ops

Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34975>
This commit is contained in:
Lorenzo Rossi 2025-05-23 16:01:32 +02:00 committed by Marge Bot
parent 82d789d22a
commit ac3fd5768b
4 changed files with 996 additions and 6 deletions

View file

@ -2534,6 +2534,155 @@ impl<'a> ShaderFromNir<'a> {
});
self.set_dst(&intrin.def, dst);
}
nir_intrinsic_suclamp_nv => {
let coords = self.get_src(&srcs[0]);
let params = self.get_src(&srcs[1]);
let flags = intrin.flags();
let flags: nak_nir_suclamp_flags =
unsafe { std::mem::transmute_copy(&flags) };
let mode = match flags.mode() {
NAK_SUCLAMP_MODE_BLOCK_LINEAR => SuClampMode::BlockLinear,
NAK_SUCLAMP_MODE_PITCH_LINEAR => SuClampMode::PitchLinear,
NAK_SUCLAMP_MODE_STORED_DESCRIPTOR => {
SuClampMode::StoredInDescriptor
}
_ => panic!("Invalid suclamp mode"),
};
let round = match flags.round() {
NAK_SUCLAMP_ROUND_R1 => SuClampRound::R1,
NAK_SUCLAMP_ROUND_R2 => SuClampRound::R2,
NAK_SUCLAMP_ROUND_R4 => SuClampRound::R4,
NAK_SUCLAMP_ROUND_R8 => SuClampRound::R8,
NAK_SUCLAMP_ROUND_R16 => SuClampRound::R16,
_ => panic!("Invalid suclamp round"),
};
let dst = b.alloc_ssa(RegFile::GPR);
let out_of_bounds = b.alloc_ssa(RegFile::Pred);
b.push_op(OpSuClamp {
dst: dst.into(),
out_of_bounds: out_of_bounds.into(),
coords,
params,
mode,
round,
is_2d: flags.is_2d(),
is_s32: flags.is_s32(),
imm: 0,
});
let final_dst =
vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())];
self.set_ssa(&intrin.def, final_dst);
}
nir_intrinsic_subfm_nv => {
let x = self.get_src(&srcs[0]);
let y = self.get_src(&srcs[1]);
let z = self.get_src(&srcs[2]);
let is_3d = intrin.flags() != 0;
let dst = b.alloc_ssa(RegFile::GPR);
let out_of_bounds = b.alloc_ssa(RegFile::Pred);
b.push_op(OpSuBfm {
dst: dst.into(),
pdst: out_of_bounds.into(),
srcs: [x, y, z],
is_3d,
});
let final_dst =
vec![dst, b.sel(out_of_bounds.into(), 1.into(), 0.into())];
self.set_ssa(&intrin.def, final_dst);
}
nir_intrinsic_sueau_nv => {
let off = self.get_src(&srcs[0]);
let bit_field = self.get_src(&srcs[1]);
let addr = self.get_src(&srcs[2]);
let dst = b.alloc_ssa(RegFile::GPR);
b.push_op(OpSuEau {
dst: dst.into(),
off,
bit_field,
addr,
});
self.set_dst(&intrin.def, dst.into());
}
nir_intrinsic_imadsp_nv => {
let src0 = self.get_src(&srcs[0]);
let src1 = self.get_src(&srcs[1]);
let src2 = self.get_src(&srcs[2]);
let flags = intrin.flags();
let flags: nak_nir_imadsp_flags =
unsafe { std::mem::transmute_copy(&flags) };
let translate_src_type = |s| {
use IMadSpSrcType::*;
match s {
NAK_IMAD_TYPE_U32 => U32,
NAK_IMAD_TYPE_U24 => U24,
NAK_IMAD_TYPE_U16_LO => U16Lo,
NAK_IMAD_TYPE_U16_HI => U16Hi,
NAK_IMAD_TYPE_S32 => S32,
NAK_IMAD_TYPE_S24 => S24,
NAK_IMAD_TYPE_S16_LO => S16Lo,
NAK_IMAD_TYPE_S16_HI => S16Hi,
_ => panic!("Invalid imadsp mode"),
}
};
let mode = if flags.params_from_src1() {
IMadSpMode::FromSrc1
} else {
IMadSpMode::Explicit([
translate_src_type(flags.src0()),
translate_src_type(flags.src1()),
translate_src_type(flags.src2()),
])
};
let dst = b.alloc_ssa(RegFile::GPR);
b.push_op(OpIMadSp {
srcs: [src0, src1, src2],
dst: dst.into(),
mode,
});
self.set_dst(&intrin.def, dst.into());
}
nir_intrinsic_suldga_nv => {
let addr = self.get_src(&srcs[0]);
let format = self.get_src(&srcs[1]);
let out_of_bounds = self.get_src(&srcs[2]);
let comps = intrin.num_components;
assert!(intrin.def.bit_size() == 32);
let mem_type = self.get_image_mem_type(intrin);
let flags = intrin.flags();
let offset_mode = match flags {
NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32,
NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32,
NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8,
NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8,
_ => panic!("Invalid suldga flags"),
};
let dst = b.alloc_ssa_vec(RegFile::GPR, comps);
b.push_op(OpSuLdGa {
dst: dst.clone().into(),
addr,
format,
out_of_bounds,
mem_type,
offset_mode,
});
self.set_dst(&intrin.def, dst);
}
nir_intrinsic_bindless_image_load
| nir_intrinsic_bindless_image_load_raw_nv => {
let handle = self.get_src(&srcs[0]);
@ -2624,6 +2773,33 @@ impl<'a> ShaderFromNir<'a> {
self.set_ssa(&intrin.def, final_dst);
}
nir_intrinsic_sustga_nv => {
let addr = self.get_src(&srcs[0]);
let format = self.get_src(&srcs[1]);
let out_of_bounds = self.get_src(&srcs[2]);
let data = self.get_src(&srcs[3]);
let image_access =
ImageAccess::Formatted(ChannelMask::new(0xf));
let flags = intrin.flags();
let offset_mode = match flags {
NAK_SUGA_OFF_MODE_U32 => SuGaOffsetMode::U32,
NAK_SUGA_OFF_MODE_S32 => SuGaOffsetMode::S32,
NAK_SUGA_OFF_MODE_U8 => SuGaOffsetMode::U8,
NAK_SUGA_OFF_MODE_S8 => SuGaOffsetMode::S8,
_ => panic!("Invalid sustga flags"),
};
b.push_op(OpSuStGa {
addr,
format,
data,
out_of_bounds,
image_access,
offset_mode,
});
}
nir_intrinsic_bindless_image_store => {
let handle = self.get_src(&srcs[0]);
let dim = self.get_image_dim(intrin);

View file

@ -1180,6 +1180,126 @@ fn test_iadd64() {
}
}
#[test]
fn test_op_suclamp() {
    // suclamp only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    // We cannot test every single combination of options.
    // Use a random generator for rounding and immediate.
    let mut rng = Acorn::new();
    let modes = [
        SuClampMode::StoredInDescriptor,
        SuClampMode::PitchLinear,
        SuClampMode::BlockLinear,
    ];
    for mode in modes {
        // Exhaust all four (is_s32, is_2d) combinations per mode.
        for bits in 0..4 {
            let is_s32 = (bits & 0x1) != 0;
            let is_2d = (bits & 0x2) != 0;
            // The immediate is an i6 value, so it lies in -32..=31.
            let imm = (rng.get_u32() % 64) as i8 - 32;
            let round = match rng.get_u32() % 5 {
                0 => SuClampRound::R1,
                1 => SuClampRound::R2,
                2 => SuClampRound::R4,
                3 => SuClampRound::R8,
                _ => SuClampRound::R16,
            };
            test_foldable_op(OpSuClamp {
                dst: Dst::None,
                out_of_bounds: Dst::None,
                mode,
                round,
                is_s32,
                is_2d,
                coords: 0.into(),
                params: 0.into(),
                imm,
            });
        }
    }
}
#[test]
fn test_op_subfm() {
    // subfm only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    // Exercise both the 2D and 3D variants.
    for is_3d in [false, true] {
        test_foldable_op(OpSuBfm {
            dst: Dst::None,
            pdst: Dst::None,
            srcs: [0.into(), 0.into(), 0.into()],
            is_3d,
        });
    }
}
#[test]
fn test_op_sueau() {
    // sueau only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    // sueau has no modifiers, so a single op covers it.
    test_foldable_op(OpSuEau {
        dst: Dst::None,
        off: 0.into(),
        bit_field: 0.into(),
        addr: 0.into(),
    });
}
#[test]
fn test_op_imadsp() {
    // imadsp only exists on Kepler hardware.
    if !RunSingleton::get().sm.is_kepler() {
        return;
    }
    use IMadSpSrcType::*;
    // Legal widths per source (src1 can never be U32/U16Hi,
    // src2 can never be U16Hi).
    let src0_widths = [U32, U24, U16Lo, U16Hi];
    let src1_widths = [U24, U16Lo];
    let src2_widths = [U32, U24, U16Lo];
    let mut modes = Vec::new();
    // Cartesian product of widths and sign combinations.
    for w0 in src0_widths {
        for w1 in src1_widths {
            for w2 in src2_widths {
                for sign in 0..4 {
                    let s0 = (sign & 0x1) != 0;
                    let s1 = (sign & 0x2) != 0;
                    // src2 must be signed if either input is signed.
                    modes.push(IMadSpMode::Explicit([
                        w0.with_sign(s0),
                        w1.with_sign(s1),
                        w2.with_sign(s0 || s1),
                    ]));
                }
            }
        }
    }
    modes.push(IMadSpMode::FromSrc1);
    for mode in modes {
        test_foldable_op(OpIMadSp {
            dst: Dst::None,
            srcs: [0.into(), 0.into(), 0.into()],
            mode,
        });
    }
}
#[test]
fn test_ineg64() {
let run = RunSingleton::get();

View file

@ -4,7 +4,7 @@
extern crate bitview;
extern crate nak_ir_proc;
use bitview::{BitMutView, BitView};
use bitview::{BitMutView, BitMutViewable, BitView, BitViewable, SetField};
use nak_bindings::*;
pub use crate::builder::{Builder, InstrBuilder, SSABuilder, SSAInstrBuilder};
@ -5183,6 +5183,669 @@ impl DisplayOp for OpSuAtom {
}
impl_display_for_op!(OpSuAtom);
/// Addressing layout used by suclamp when clamping surface coordinates.
#[derive(Clone, Copy)]
pub enum SuClampMode {
    /// The layout (pitch- vs block-linear) is read at runtime from
    /// bit 21 of the packed `params` source.
    StoredInDescriptor,
    PitchLinear,
    BlockLinear,
}
impl fmt::Display for SuClampMode {
    /// Prints the assembly suffix for this mode (".sd", ".pl" or ".bl").
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            SuClampMode::StoredInDescriptor => ".sd",
            SuClampMode::PitchLinear => ".pl",
            SuClampMode::BlockLinear => ".bl",
        })
    }
}
/// Rounding applied to the clamp bound of suclamp: the bound is rounded
/// down to a multiple of 1, 2, 4, 8 or 16 (see `to_mask`).
#[derive(Clone, Copy)]
pub enum SuClampRound {
    R1,
    R2,
    R4,
    R8,
    R16,
}
impl SuClampRound {
pub fn to_int(&self) -> u8 {
match self {
SuClampRound::R1 => 1,
SuClampRound::R2 => 2,
SuClampRound::R4 => 4,
SuClampRound::R8 => 8,
SuClampRound::R16 => 16,
}
}
pub fn to_mask(&self) -> u32 {
!(self.to_int() as u32 - 1)
}
}
impl fmt::Display for SuClampRound {
    /// Prints the assembly suffix, e.g. ".r4" for `R4`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let r = self.to_int();
        write!(f, ".r{r}")
    }
}
/// Kepler only
/// Surface Clamp
///
/// Can clamp coordinates of surface operations in a 0..=clamp inclusive
/// range. It also computes other information useful to compute the
/// real address of an element within an image for both block-linear and
/// pitch-linear layouts. We can also reduce this operation to a "stupid"
/// inclusive clamp by setting modifier Mode=PitchLinear and is_2d=false;
/// this will not compute any extra operations and is useful to clamp array
/// indexes.
///
/// Since the shader code does not know if an image layout is block-linear
/// or pitch-linear, this opcode must be able to do both; the operation
/// is then selected by the "clamp" bitfield, usually read from a descriptor.
/// In block-linear mode we divide the bits that will compute the higher
/// part and the lower part.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuClamp {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Set when the (offset) coordinate fell outside the 0..=bound range.
    #[dst_type(Pred)]
    pub out_of_bounds: Dst,
    /// This modifier specifies if we use pitch-linear or block-linear
    /// calculations; another option is to support both and read the actual
    /// layout from the clamp bitfield (shader code doesn't always know an
    /// image's layout).
    /// When mode=pitch_linear and is_2d=false the suclamp op enters a
    /// simpler "plain" mode where it only performs clamping and the output
    /// register doesn't contain any information bits about pitch-linear or
    /// block-linear calculations
    pub mode: SuClampMode,
    /// Strangely enough, "round" just rounds the clamp, not the source,
    /// so it does not help at all with clamping coordinates.
    /// It could be useful when clamping raw addresses of a multi-byte read.
    /// ex: if we read 4 bytes at once, and the buffer length is 16,
    /// the bounds will be 15 (they are inclusive), but if we read
    /// at address 15 we would read bytes 15..19, so we are out of range.
    /// If we clamp the bounds to R4 the effective bound becomes 12
    /// so the read will be performed from 12..16, remaining in bounds.
    pub round: SuClampRound,
    /// Treat the coordinate as signed 32-bit (allows negative inputs).
    pub is_s32: bool,
    pub is_2d: bool,
    #[src_type(GPR)]
    pub coords: Src,
    /// Packed parameter containing both bounds (inclusive)
    /// and other information (explained in more details in Foldable):
    /// 0..20: bound (inclusive)
    /// 21: pitch_linear (used if mode == StoredInDescriptor)
    /// 22..26: coord shl
    /// 26..29: coord shr
    /// 29..32: n. of tiles
    #[src_type(ALU)]
    pub params: Src,
    /// Added to the coords, it's only an i6
    pub imm: i8,
}
impl Foldable for OpSuClamp {
    /// Constant-folds suclamp: clamps the (coord + imm) value to the bound
    /// packed in `params` and packs the layout-dependent address-helper
    /// bits into the 32-bit result exactly as the hardware does.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let src = f.get_u32_src(self, &self.coords);
        let params = f.get_u32_src(self, &self.params);
        let imm = self.imm; // i6
        // Widen to i64 so adding the immediate cannot overflow and
        // negative coordinates can be detected uniformly.
        let src = if self.is_s32 {
            (src as i32) as i64
        } else {
            src as i64
        };
        let src = src + (imm as i64);
        let params_bv = BitView::new(&params);
        let pitch_linear = match self.mode {
            // Layout bit lives in the descriptor's packed params.
            SuClampMode::StoredInDescriptor => params_bv.get_bit(21),
            SuClampMode::PitchLinear => true,
            SuClampMode::BlockLinear => false,
        };
        let bounds = if pitch_linear && !self.is_2d {
            // "Plain" clamp mode: all 32 bits of params are the bound.
            params
        } else {
            params_bv.get_bit_range_u64(0..20) as u32
        };
        // Round the bound down to the requested granularity.
        let bounds = bounds & self.round.to_mask();
        let (is_oob, clamped) = if src < 0 {
            (true, 0)
        } else if src > (bounds as i64) {
            (true, bounds)
        } else {
            (false, src as u32)
        };
        let mut out = 0u32;
        let mut bv = BitMutView::new(&mut out);
        if pitch_linear {
            if !self.is_2d {
                // simple clamp mode, NO BITFIELD
                bv.set_field(0..32, clamped);
            } else {
                // Real pitch_linear mode
                bv.set_field(0..20, clamped & 0xfffff);
                // Pass through el_size_log2
                bv.set_field(27..30, params_bv.get_bit_range_u64(26..29));
                bv.set_bit(30, true); // pitch_linear=true
                bv.set_bit(31, is_oob);
            }
        } else {
            // Block linear
            // Number of bits to discard for GoB coordinates
            let shr_a = params_bv.get_bit_range_u64(22..26) as u8;
            // Block coords
            bv.set_field(0..16, (clamped >> shr_a) & 0xffff);
            // Shift applied to coords, always zero except for x.
            // (for coord x=1 and format R32, we want to access byte 4)
            // e.g. R8 -> 0, R32 -> 2, 128 -> 4
            let el_size_log2 = params_bv.get_bit_range_u64(26..29) as u8;
            // Coord inside GoB (element space)
            bv.set_field(16..24, (clamped << el_size_log2) & 0xff);
            // Useful later to compute gob-space coords.
            let n_tiles = params_bv.get_bit_range_u64(29..32) as u8;
            bv.set_field(27..30, n_tiles);
            bv.set_bit(30, false); // pitch_linear=false
            bv.set_bit(31, is_oob);
        }
        f.set_u32_dst(self, &self.dst, out);
        f.set_pred_dst(self, &self.out_of_bounds, is_oob);
    }
}
impl DisplayOp for OpSuClamp {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The mode suffix is always printed; the remaining modifiers
        // only when they deviate from the default.
        write!(f, "suclamp{}", self.mode)?;
        match self.round {
            SuClampRound::R1 => {}
            round => write!(f, "{}", round)?,
        }
        if !self.is_s32 {
            write!(f, ".u32")?;
        }
        if !self.is_2d {
            write!(f, ".1d")?;
        }
        write!(f, " {} {} {:x}", self.coords, self.params, self.imm)
    }
}
impl_display_for_op!(OpSuClamp);
/// Kepler only
/// BitField Merge
///
/// The resulting bit-field is composed of a high-part 8..32 that is merged
/// with the address by sueau, and a lower-part 0..8 that is provided
/// directly to suldga/sustga and defines the lower offset of the global
/// array.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuBfm {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Out-of-bounds predicate: set when any used input coordinate has
    /// its OOB bit (bit 31) set.
    #[dst_type(Pred)]
    pub pdst: Dst,
    /// x, y, z
    #[src_type(ALU)]
    pub srcs: [Src; 3],
    /// When is_3d=false the third source is ignored, but still used in
    /// pitch-linear computation.
    pub is_3d: bool,
}
impl Foldable for OpSuBfm {
    /// Constant-folds subfm: merges the per-axis suclamp results into the
    /// single bit-field consumed by sueau and suldga/sustga. The exact bit
    /// positions replicate the hardware, so statement order matters.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let x_raw = f.get_u32_src(self, &self.srcs[0]);
        let y_raw = f.get_u32_src(self, &self.srcs[1]);
        let z_raw = f.get_u32_src(self, &self.srcs[2]);
        let x = BitView::new(&x_raw);
        let y = BitView::new(&y_raw);
        let z = BitView::new(&z_raw);
        let mut o_raw = 0u32;
        let mut o = BitMutView::new(&mut o_raw);
        // Bit 30 of a suclamp result flags pitch-linear 2D mode.
        let is_pitch_linear_2d = x.get_bit(30) || y.get_bit(30);
        if !is_pitch_linear_2d {
            // Copy coordinates inside of GoB space.
            // They are 6 bits from x and 3 from y (GoB is 64x8 bytes).
            // Bits from 0..8 are ignored by sueau and are used directly
            // by suldga/sustga.
            // Bit 9 will become the first bit of the higher part in
            // sueau.
            o.set_bit_range_u64(0..4, x.get_bit_range_u64(16..20));
            // Address calculation inside of GoB should virtually be
            // y * 64 + x * element_size (each row is linear).
            // So why are those bits swizzled like so?
            // I have no idea, but these are correct even for atomics
            // that accept real addresses.
            o.set_bit(4, y.get_bit(16));
            o.set_bit(5, y.get_bit(17));
            o.set_bit(6, x.get_bit(20));
            o.set_bit(7, y.get_bit(18));
            o.set_bit(8, x.get_bit(21));
            // 9..11: 0
            // -------------- Tiles --------------
            // Number of tiles log2
            let ntx = x.get_bit_range_u64(27..30) & 0x1;
            let nty = y.get_bit_range_u64(27..30);
            let ntz = z.get_bit_range_u64(27..30);
            let ntz = ntz * (self.is_3d as u64); // z is ignored if is_3d=false
            // Computes how many bits to dedicate to GoB coords inside
            // a block
            o.set_field(12..16, ntx + nty + ntz);
            // Coords in gob_space.
            // Remove 6 bits from x and 3 bits from y, those are used
            // as element coords in GoB space.
            let a = x.get_bit_range_u64(22..24); // 1100_0000
            let b = y.get_bit_range_u64(19..24); // 1111_1000
            let c = z.get_bit_range_u64(16..24); // 1111_1111
            // nt* indicates how many bits to consider (max 5)
            let a = a & ((1 << ntx) - 1);
            let b = b & ((1 << nty.min(5)) - 1);
            let c = c & ((1 << ntz.min(5)) - 1);
            // Compute gob offset
            // We can just or together at certain offsets because
            // Tiles are always powers of two in each direction.
            // z || y || x (LSB)
            let res = c;
            let res = (res << nty) | b;
            let res = (res << ntx) | a;
            let mask = match ntx {
                0 => 0x3ff,
                _ => 0x7ff,
            };
            // gob coords will be put before the block coords in
            // sueau.
            o.set_field(16..27, res & mask);
        } else {
            // Pitch-linear 2D: only the element offset and element size
            // are forwarded.
            let d = z.get_bit_range_u64(0..8);
            let el_size_log2 = x.get_bit_range_u64(27..30);
            o.set_field(0..8, (d << el_size_log2) & 0xff);
            // 9..11: 0
            o.set_field(12..15, el_size_log2);
        }
        o.set_bit(11, is_pitch_linear_2d);
        // OOB if any used coordinate was clamped (z only counts in 3D).
        let is_oob =
            x.get_bit(31) || y.get_bit(31) || (z.get_bit(31) && self.is_3d);
        f.set_u32_dst(self, &self.dst, o_raw);
        f.set_pred_dst(self, &self.pdst, is_oob);
    }
}
impl DisplayOp for OpSuBfm {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let dim = if self.is_3d { ".3d" } else { "" };
        let [x, y, z] = &self.srcs;
        write!(f, "subfm{} {} {} {}", dim, x, y, z)
    }
}
impl_display_for_op!(OpSuBfm);
/// Kepler only
/// Used to compute the higher 32 bits of image address using
/// the merged bitfield and the block coordinates (offset).
/// It can switch to a pitch_linear mode (bit 11 of bit-field).
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpSuEau {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// offset is computed from the block coordinates.
    /// it's ok to add it directly to the address since they are both
    /// "aligned" to 64 (the first 8 bits are removed from both)
    #[src_type(GPR)]
    pub off: Src,
    /// Merged bit-field, as produced by subfm:
    /// 8.. 9: offset, last bit
    /// 11..12: pitch_linear: when enabled the bf-offset is ignored and
    /// the off_shl is subtracted by 8
    /// 12..16: off_shl, shifts left the offset by off_shl + 1
    /// 16..27: 11-bit offset, when joined with the 1-bit offset completes the
    /// 12-bit offset ORed to the src offset after shifting
    /// (unless pitch_linear)
    #[src_type(ALU)]
    pub bit_field: Src,
    #[src_type(GPR)]
    pub addr: Src,
}
impl Foldable for OpSuEau {
    /// Constant-folds sueau: combines the block offset, the merged
    /// bit-field and the base address into the high address word.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let off_raw = f.get_u32_src(self, &self.off);
        let bf_raw = f.get_u32_src(self, &self.bit_field);
        let addr = f.get_u32_src(self, &self.addr);
        let bf = BitView::new(&bf_raw);
        // Unpack the bit-field (layout documented on the struct).
        let off1 = bf.get_bit_range_u64(8..9) as u32;
        let is_pitch_linear = bf.get_bit(11);
        let off_shift = bf.get_bit_range_u64(12..16) as u32;
        let offs = bf.get_bit_range_u64(16..27) as u32;
        let res = if !is_pitch_linear {
            // Block linear
            // off_raw are the block coordinates
            // to those we add gob coordinates from the merged bitfield
            // and the MSB of in-gob coordinates.
            let omul = off_shift + 1;
            let real_off = (off_raw << omul) | (offs << 1) | off1;
            addr.wrapping_add(real_off & 0x7ff_ffff)
        } else {
            // Add the high part of the coordinates to addr
            // off << (omul - 8)
            // but for negative values do a shr instead.
            // In fact, off_shift will always be < 8 because pitch_linear
            // subfm only assigns bits 12..15, so this is always a shr
            let shl_amount = off_shift as i32 - 8;
            let off = if shl_amount < 0 {
                off_raw >> (-shl_amount as u32)
            } else {
                off_raw << (shl_amount as u32)
            };
            addr.wrapping_add(off & 0xff_ffff)
        };
        f.set_u32_dst(self, &self.dst, res);
    }
}
impl DisplayOp for OpSuEau {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "sueau {} {} {}", self.off, self.bit_field, self.addr)
    }
}
impl_display_for_op!(OpSuEau);
/// Source-extraction type for imadsp operands: selects how many bits of
/// the 32-bit register are read (32/24/16, low or high half) and whether
/// the extracted field is sign-extended.
#[derive(Copy, Clone, Debug)]
pub enum IMadSpSrcType {
    U32,
    U24,
    U16Hi,
    U16Lo,
    S32,
    S24,
    S16Hi,
    S16Lo,
}
impl IMadSpSrcType {
    /// Returns the unsigned counterpart of this type (identity for
    /// already-unsigned types).
    pub fn unsigned(self) -> IMadSpSrcType {
        use IMadSpSrcType::*;
        match self {
            S32 => U32,
            S24 => U24,
            S16Hi => U16Hi,
            S16Lo => U16Lo,
            other => other,
        }
    }
    /// Returns the signed or unsigned counterpart depending on `sign`.
    #[allow(dead_code)] // Used in hw_tests
    pub fn with_sign(self, sign: bool) -> Self {
        use IMadSpSrcType::*;
        if sign {
            match self {
                U32 => S32,
                U24 => S24,
                U16Hi => S16Hi,
                U16Lo => S16Lo,
                other => other,
            }
        } else {
            self.unsigned()
        }
    }
    /// Whether this type sign-extends its extracted field.
    pub fn sign(self) -> bool {
        use IMadSpSrcType::*;
        match self {
            S32 | S24 | S16Hi | S16Lo => true,
            U32 | U24 | U16Hi | U16Lo => false,
        }
    }
    /// Extracts the selected field from a 32-bit register value and
    /// widens it to i64, sign-extending for the signed variants.
    fn cast(&self, v: u32) -> i64 {
        use IMadSpSrcType::*;
        match self {
            U32 => i64::from(v),
            U24 => i64::from(v & 0x00ff_ffff),
            U16Lo => i64::from(v as u16),
            U16Hi => i64::from(v >> 16),
            S32 => i64::from(v as i32),
            // Shift up then arithmetic-shift down to sign extend 24 bits.
            S24 => i64::from(((v as i32) << 8) >> 8),
            S16Lo => i64::from(v as i16),
            S16Hi => i64::from((v >> 16) as i16),
        }
    }
}
impl fmt::Display for IMadSpSrcType {
    /// Prints the assembly suffix, e.g. ".s24" or ".u16h1".
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let sign = if self.sign() { "s" } else { "u" };
        let width = match self.unsigned() {
            IMadSpSrcType::U32 => "32",
            IMadSpSrcType::U24 => "24",
            IMadSpSrcType::U16Lo => "16h0",
            IMadSpSrcType::U16Hi => "16h1",
            // unsigned() never returns a signed variant
            _ => unreachable!(),
        };
        write!(f, ".{}{}", sign, width)
    }
}
/// Operand-type selection for imadsp.
#[derive(Clone, Copy, Debug)]
pub enum IMadSpMode {
    /// Types for src0, src1, src2 encoded explicitly in the instruction.
    Explicit([IMadSpSrcType; 3]),
    // Parameters are loaded from src1 bits 26..32
    FromSrc1,
}
impl fmt::Display for IMadSpMode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Explicit types print back-to-back, e.g. ".u32.u24.u24".
            IMadSpMode::Explicit(types) => {
                for t in types {
                    write!(f, "{t}")?;
                }
                Ok(())
            }
            IMadSpMode::FromSrc1 => f.write_str(".sd"),
        }
    }
}
/// Kepler only
/// Extracted Integer Multiply and Add.
/// It does the same operation as an imad op, but it can extract the
/// sources from a subset of the register (only 32, 24 or 16 bits).
/// It can also do a "load parameters" mode where the modifiers are
/// loaded from the higher bits in src1 (check Foldable impl for details).
/// Limits: src1 can never be U32 or U16Hi,
/// src2 can never be U16Hi
/// src2 signedness is tied to src1 and src0 signedness,
/// if either is signed, src2 must be signed too.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice, Clone)]
pub struct OpIMadSp {
    #[dst_type(GPR)]
    pub dst: Dst,
    /// Computes srcs[0] * srcs[1] + srcs[2].
    #[src_type(ALU)]
    pub srcs: [Src; 3],
    /// How each source field is extracted (explicit or read from src1).
    pub mode: IMadSpMode,
}
impl Foldable for OpIMadSp {
    /// Constant-folds imadsp: extracts each source per its type and
    /// computes src0 * src1 + src2, truncated to 32 bits.
    fn fold(&self, _sm: &dyn ShaderModel, f: &mut OpFoldData<'_>) {
        let src0 = f.get_u32_src(self, &self.srcs[0]);
        let src1 = f.get_u32_src(self, &self.srcs[1]);
        let src2 = f.get_u32_src(self, &self.srcs[2]);
        let (src_type0, src_type1, src_type2) = match self.mode {
            IMadSpMode::Explicit([t0, t1, t2]) => (t0, t1, t2),
            IMadSpMode::FromSrc1 => {
                // Type selectors live in the top 6 bits of src1.
                let params = BitView::new(&src1);
                let st2 = params.get_bit_range_u64(26..28) as usize;
                let st1 = params.get_bit_range_u64(28..30) as usize;
                let st0 = params.get_bit_range_u64(30..32) as usize;
                use IMadSpSrcType::*;
                let types0 = [U32, U24, U16Lo, U16Hi];
                let types1 = [U16Lo, U24, U16Lo, U24];
                let types2 = [U32, U24, U16Lo, U32];
                (
                    types0[st0].unsigned(),
                    types1[st1].unsigned(),
                    types2[st2].unsigned(),
                )
            }
        };
        let src0 = src_type0.cast(src0);
        let src1 = src_type1.cast(src1);
        let src2 = src_type2.cast(src2);
        // src1 is never a full 32-bit type (see struct docs), so the
        // i64 product cannot overflow before truncation to u32.
        f.set_u32_dst(self, &self.dst, (src0 * src1 + src2) as u32);
    }
}
impl DisplayOp for OpIMadSp {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let [a, b, c] = &self.srcs;
        write!(f, "imadsp{} {} {} {}", self.mode, a, b, c)
    }
}
impl_display_for_op!(OpIMadSp);
/// In SuGa ops, the address is always specified in two parts, the higher
/// part contains the base address without the lower 8 bits (base_addr >> 8),
/// while the lower part might contain either the missing 8 bits (U8) or
/// a full 32-bit offset that must not be shifted (U32).
///
/// In short:
/// U8 : real_address = (addr_hi << 8) + (addr_lo & 0xFF)
/// U32: real_address = (addr_hi << 8) + addr_lo
/// The signed variants presumably do the same but with sign extension —
/// not yet verified on hardware.
#[derive(Clone, Copy)]
pub enum SuGaOffsetMode {
    U32,
    S32,
    U8,
    S8,
}
/// Kepler only
/// Load a pixel from an image, takes the pixel address and format as an
/// argument. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true it always loads
/// (0, 0, 0, 1)
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuLdGa {
    // NOTE(review): unlike the other surface ops here, `dst` carries no
    // #[dst_type(...)] attribute — confirm the derive's default matches GPR.
    pub dst: Dst,
    /// Memory type of the loaded data.
    pub mem_type: MemType,
    /// How the low address part is combined with the high part.
    pub offset_mode: SuGaOffsetMode,
    /// Format for the loaded data, passed directly from the descriptor.
    #[src_type(GPR)]
    pub format: Src,
    /// This is not an address, but it's two registers that contain
    /// [addr >> 8, addr & 0xff].
    /// This works because addr >> 8 is 32-bits (GOB-aligned) and the
    /// remaining 8 bits are extracted by the bit-field.
    /// It's useful since in block-linear mode the lower bits and the higher
    /// bits are computed in different ways.
    #[src_type(SSA)]
    pub addr: Src,
    #[src_type(Pred)]
    pub out_of_bounds: Src,
}
impl DisplayOp for OpSuLdGa {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "suldga{} [{}]", self.mem_type, self.addr)?;
        write!(f, " {} {}", self.format, self.out_of_bounds)
    }
}
impl_display_for_op!(OpSuLdGa);
/// Kepler only
/// Store a pixel in an image, takes the pixel address and format as an
/// argument. Since the image coordinates are not present, the instruction
/// also needs an `out_of_bounds` predicate; when true, stores are ignored
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpSuStGa {
    pub image_access: ImageAccess,
    /// How the low address part is combined with the high part.
    pub offset_mode: SuGaOffsetMode,
    /// Format for the stored data, passed directly from the descriptor.
    #[src_type(GPR)]
    pub format: Src,
    /// Split address: two registers holding [addr >> 8, addr & 0xff],
    /// same scheme as suldga's address source.
    #[src_type(SSA)]
    pub addr: Src,
    #[src_type(SSA)]
    pub data: Src,
    #[src_type(Pred)]
    pub out_of_bounds: Src,
}
impl DisplayOp for OpSuStGa {
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "sustga{} [{}]", self.image_access, self.addr)?;
        write!(f, " {} {} {}", self.format, self.data, self.out_of_bounds)
    }
}
impl_display_for_op!(OpSuStGa);
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpLd {
@ -6676,6 +7339,12 @@ pub enum Op {
SuLd(OpSuLd),
SuSt(OpSuSt),
SuAtom(OpSuAtom),
SuClamp(OpSuClamp),
SuBfm(OpSuBfm),
SuEau(OpSuEau),
IMadSp(OpIMadSp),
SuLdGa(OpSuLdGa),
SuStGa(OpSuStGa),
Ld(OpLd),
Ldc(OpLdc),
LdSharedLock(OpLdSharedLock),
@ -6805,6 +7474,10 @@ impl Op {
| Op::LeaX(_)
| Op::Lop2(_)
| Op::Lop3(_)
| Op::SuClamp(_)
| Op::SuBfm(_)
| Op::SuEau(_)
| Op::IMadSp(_)
| Op::Shf(_)
| Op::Shl(_)
| Op::Shr(_)
@ -6834,7 +7507,11 @@ impl Op {
| Op::Txq(_) => false,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => false,
Op::SuLd(_)
| Op::SuSt(_)
| Op::SuAtom(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => false,
// Memory ops
Op::Ld(_)
@ -7232,7 +7909,11 @@ impl Instr {
Op::Atom(op) => op.mem_space != MemSpace::Local,
Op::Ld(op) => op.access.space != MemSpace::Local,
Op::St(op) => op.access.space != MemSpace::Local,
Op::SuAtom(_) | Op::SuLd(_) | Op::SuSt(_) => true,
Op::SuAtom(_)
| Op::SuLd(_)
| Op::SuSt(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => true,
_ => false,
}
}
@ -7241,7 +7922,7 @@ impl Instr {
match &self.op {
Op::Atom(op) => matches!(op.mem_space, MemSpace::Global(_)),
Op::St(op) => matches!(op.access.space, MemSpace::Global(_)),
Op::SuAtom(_) | Op::SuSt(_) => true,
Op::SuAtom(_) | Op::SuSt(_) | Op::SuStGa(_) => true,
_ => false,
}
}
@ -7250,6 +7931,7 @@ impl Instr {
match &self.op {
Op::ASt(_)
| Op::SuSt(_)
| Op::SuStGa(_)
| Op::SuAtom(_)
| Op::LdSharedLock(_)
| Op::St(_)

View file

@ -129,6 +129,10 @@ pub fn side_effect_type(op: &Op) -> SideEffect {
| Op::LeaX(_)
| Op::Lop2(_)
| Op::Lop3(_)
| Op::SuClamp(_)
| Op::SuBfm(_)
| Op::SuEau(_)
| Op::IMadSp(_)
| Op::Shf(_)
| Op::Shl(_)
| Op::Shr(_)
@ -158,7 +162,11 @@ pub fn side_effect_type(op: &Op) -> SideEffect {
| Op::Txq(_) => SideEffect::Memory,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => SideEffect::Memory,
Op::SuLd(_)
| Op::SuSt(_)
| Op::SuAtom(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => SideEffect::Memory,
// Memory ops
Op::Ipa(_) | Op::Ldc(_) => SideEffect::None,
@ -262,7 +270,11 @@ pub fn estimate_variable_latency(sm: u8, op: &Op) -> u32 {
| Op::Txq(_) => 32,
// Surface ops
Op::SuLd(_) | Op::SuSt(_) | Op::SuAtom(_) => 32,
Op::SuLd(_)
| Op::SuSt(_)
| Op::SuAtom(_)
| Op::SuLdGa(_)
| Op::SuStGa(_) => 32,
// Memory ops
Op::Ldc(_) => 4,