nak: implement SHL and SHR on SM50

SHF.{L,R} is supported, but it seems to always write 0 to dst when the
shift value is a register. The only case in nak_from_nir that actually
uses the 64-bit shift is nir_op_isign, which has an immediate shift
value.

This also avoids the SHF.I32 issue, since the only usage is now SHF.I64.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26114>
This commit is contained in:
Benjamin Lee 2023-11-15 13:25:17 -08:00 committed by Marge Bot
parent 286b832f74
commit 00be041ffc
5 changed files with 176 additions and 37 deletions

View file

@ -121,6 +121,55 @@ pub trait Builder {
pub trait SSABuilder: Builder {
fn alloc_ssa(&mut self, file: RegFile, comps: u8) -> SSARef;
/// Emits a left shift of `x` by `shift` and returns the freshly allocated
/// single-component GPR SSA value that receives the result.
///
/// On SM70 and newer the shift is expressed as an `OpShf` with `x` in the
/// low half and zero in the high half; on older hardware the dedicated
/// `OpShl` is emitted instead.
///
/// The shift amount is always emitted with `wrap: true`.  The NIR shift
/// opcodes this helper lowers were previously open-coded with wrapping
/// enabled, so the helper has to keep that flag set to preserve behavior.
fn shl(&mut self, x: Src, shift: Src) -> SSARef {
    let dst = self.alloc_ssa(RegFile::GPR, 1);
    if self.sm() >= 70 {
        self.push_op(OpShf {
            dst: dst.into(),
            low: x,
            high: 0.into(),
            shift,
            right: false,
            // Wrap (rather than clamp) out-of-range shift counts.
            wrap: true,
            data_type: IntType::I32,
            dst_high: false,
        });
    } else {
        self.push_op(OpShl {
            dst: dst.into(),
            src: x,
            shift,
            // Wrap (rather than clamp) out-of-range shift counts.
            wrap: true,
        });
    }
    dst
}
/// Emits a right shift of `x` by `shift` and returns the freshly allocated
/// single-component GPR SSA value that receives the result.
///
/// `signed` is forwarded to the instruction: on SM70 and newer it selects
/// `IntType::I32` versus `IntType::U32` for the `OpShf`, on older hardware
/// it becomes `OpShr::signed`.
///
/// The shift amount is always emitted with `wrap: true`.  The NIR shift
/// opcodes this helper lowers were previously open-coded with wrapping
/// enabled, so the helper has to keep that flag set to preserve behavior.
fn shr(&mut self, x: Src, shift: Src, signed: bool) -> SSARef {
    let dst = self.alloc_ssa(RegFile::GPR, 1);
    if self.sm() >= 70 {
        self.push_op(OpShf {
            dst: dst.into(),
            // `x` goes in the high half and the high half of the result is
            // read back (`dst_high: true`).
            low: 0.into(),
            high: x,
            shift,
            right: true,
            // Wrap (rather than clamp) out-of-range shift counts.
            wrap: true,
            data_type: if signed { IntType::I32 } else { IntType::U32 },
            dst_high: true,
        });
    } else {
        self.push_op(OpShr {
            dst: dst.into(),
            src: x,
            shift,
            // Wrap (rather than clamp) out-of-range shift counts.
            wrap: true,
            signed,
        });
    }
    dst
}
fn fadd(&mut self, x: Src, y: Src) -> SSARef {
let dst = self.alloc_ssa(RegFile::GPR, 1);
self.push_op(OpFAdd {

View file

@ -544,6 +544,51 @@ impl SM50Instr {
self.set_bit(50, op.wrap);
}
/// Encodes an `OpShl` into this SM50 instruction word.
///
/// The destination and the shifted value are always registers.  The opcode
/// depends on the form of the shift source:
///
/// * `0x3848` for a 32-bit immediate,
/// * `0x4c48` for a constant-buffer reference,
/// * `0x5c48` for a register (or the zero source).
///
/// Bit 39 carries the `wrap` flag.
///
/// # Panics
///
/// Panics if the shift source is of any other kind.
fn encode_shl(&mut self, op: &OpShl) {
    self.set_dst(op.dst);
    self.set_reg_src(8..16, op.src);

    let shift = op.shift;
    match shift.src_ref {
        SrcRef::Imm32(imm) => {
            self.set_opcode(0x3848);
            self.set_src_imm_i20(20..39, 56, imm);
        }
        SrcRef::CBuf(cbuf) => {
            self.set_opcode(0x4c48);
            self.set_src_cb(20..39, &cbuf);
        }
        SrcRef::Zero | SrcRef::Reg(_) => {
            self.set_opcode(0x5c48);
            self.set_reg_src(20..28, shift);
        }
        src1 => panic!("unsupported src1 type for SHL: {src1}"),
    }

    self.set_bit(39, op.wrap);
}
/// Encodes an `OpShr` into this SM50 instruction word.
///
/// The destination and the shifted value are always registers.  The opcode
/// depends on the form of the shift source:
///
/// * `0x5c28` for a register (or the zero source),
/// * `0x3828` for a 32-bit immediate,
/// * `0x4c28` for a constant-buffer reference.
///
/// Bit 39 carries the `wrap` flag and bit 48 the `signed` flag.
///
/// # Panics
///
/// Panics if the shift source is of any other kind.
fn encode_shr(&mut self, op: &OpShr) {
    self.set_dst(op.dst);
    self.set_reg_src(8..16, op.src);

    match op.shift.src_ref {
        SrcRef::Zero | SrcRef::Reg(_) => {
            self.set_opcode(0x5c28);
            self.set_reg_src(20..28, op.shift);
        }
        SrcRef::Imm32(i) => {
            self.set_opcode(0x3828);
            self.set_src_imm_i20(20..39, 56, i);
        }
        SrcRef::CBuf(cb) => {
            self.set_opcode(0x4c28);
            self.set_src_cb(20..39, &cb);
        }
        // Name the right instruction so a failure here is not mistaken
        // for one coming from the SHL encoder.
        src1 => panic!("unsupported src1 type for SHR: {src1}"),
    }

    self.set_bit(39, op.wrap);
    self.set_bit(48, op.signed);
}
fn encode_i2f(&mut self, op: &OpI2F) {
let abs_bit = 49;
let neg_bit = 45;
@ -1604,6 +1649,8 @@ impl SM50Instr {
Op::St(op) => si.encode_st(&op),
Op::Lop2(op) => si.encode_lop2(&op),
Op::Shf(op) => si.encode_shf(&op),
Op::Shl(op) => si.encode_shl(&op),
Op::Shr(op) => si.encode_shr(&op),
Op::F2F(op) => si.encode_f2f(&op),
Op::F2I(op) => si.encode_f2i(&op),
Op::I2F(op) => si.encode_i2f(&op),

View file

@ -899,18 +899,7 @@ impl<'a> ShaderFromNir<'a> {
dst
} else {
assert!(alu.def.bit_size() == 32);
let dst = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpShf {
dst: dst.into(),
low: x.into(),
high: 0.into(),
shift: shift,
right: false,
wrap: true,
data_type: IntType::U32,
dst_high: false,
});
dst
b.shl(srcs[0], srcs[1])
}
}
nir_op_ishr => {
@ -944,18 +933,7 @@ impl<'a> ShaderFromNir<'a> {
dst
} else {
assert!(alu.def.bit_size() == 32);
let dst = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpShf {
dst: dst.into(),
low: 0.into(),
high: x.into(),
shift: shift,
right: true,
wrap: true,
data_type: IntType::I32,
dst_high: true,
});
dst
b.shr(srcs[0], srcs[1], true)
}
}
nir_op_ixor => b.lop2(LogicOp2::Xor, srcs[0], srcs[1]),
@ -1153,18 +1131,7 @@ impl<'a> ShaderFromNir<'a> {
dst
} else {
assert!(alu.def.bit_size() == 32);
let dst = b.alloc_ssa(RegFile::GPR, 1);
b.push_op(OpShf {
dst: dst.into(),
low: x.into(),
high: 0.into(),
shift: shift,
right: true,
wrap: true,
data_type: IntType::U32,
dst_high: false,
});
dst
b.shr(srcs[0], srcs[1], false)
}
}
nir_op_fddx | nir_op_fddx_coarse | nir_op_fddx_fine => {

View file

@ -2816,6 +2816,60 @@ impl DisplayOp for OpShf {
}
impl_display_for_op!(OpShf);
/// Only used on SM50
///
/// Left-shift instruction: shifts `src` by `shift` and writes the result to
/// `dst`.  Displayed as `shl[.w] <src> <shift>`.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpShl {
/// Destination of the shift.
pub dst: Dst,
/// Value being shifted.  The SM50 encoder always encodes it as a register.
#[src_type(GPR)]
pub src: Src,
/// Shift amount.  The SM50 encoder accepts a register, the zero source, a
/// 32-bit immediate or a constant-buffer reference here.
#[src_type(ALU)]
pub shift: Src,
/// Printed as the `.w` modifier and encoded in bit 39 on SM50.
/// NOTE(review): presumably selects wrapping instead of clamping of
/// out-of-range shift amounts; confirm against the ISA documentation.
pub wrap: bool,
}
impl DisplayOp for OpShl {
    /// Writes the instruction as `shl[.w] <src> <shift>`; the `.w` modifier
    /// appears only when `wrap` is set.
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("shl")?;
        if self.wrap {
            f.write_str(".w")?;
        }
        write!(f, " {} {}", self.src, self.shift)
    }
}
/// Only used on SM50
///
/// Right-shift instruction: shifts `src` by `shift` and writes the result to
/// `dst`.  Displayed as `shr[.w][.u32] <src> <shift>`.
#[repr(C)]
#[derive(SrcsAsSlice, DstsAsSlice)]
pub struct OpShr {
/// Destination of the shift.
pub dst: Dst,
/// Value being shifted.  The SM50 encoder always encodes it as a register.
#[src_type(GPR)]
pub src: Src,
/// Shift amount.  The SM50 encoder accepts a register, the zero source, a
/// 32-bit immediate or a constant-buffer reference here.
#[src_type(ALU)]
pub shift: Src,
/// Printed as the `.w` modifier and encoded in bit 39 on SM50.
/// NOTE(review): presumably selects wrapping instead of clamping of
/// out-of-range shift amounts; confirm against the ISA documentation.
pub wrap: bool,
/// Encoded in bit 48 on SM50; when false the instruction is printed with
/// a `.u32` modifier.
/// NOTE(review): presumably selects an arithmetic (sign-extending) shift
/// when true and a logical shift when false; confirm.
pub signed: bool,
}
impl DisplayOp for OpShr {
    /// Writes the instruction as `shr[.w][.u32] <src> <shift>`: `.w` is
    /// added when `wrap` is set and `.u32` when the shift is not signed.
    fn fmt_op(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("shr")?;

        // Optional modifiers, in the order they are printed.
        let modifiers = [(self.wrap, ".w"), (!self.signed, ".u32")];
        for (enabled, suffix) in modifiers {
            if enabled {
                f.write_str(suffix)?;
            }
        }

        write!(f, " {} {}", self.src, self.shift)
    }
}
#[repr(C)]
#[derive(DstsAsSlice)]
pub struct OpF2F {
@ -4511,6 +4565,8 @@ pub enum Op {
Lop3(OpLop3),
PopC(OpPopC),
Shf(OpShf),
Shl(OpShl),
Shr(OpShr),
F2F(OpF2F),
F2I(OpF2I),
I2F(OpI2F),
@ -4945,7 +5001,9 @@ impl Instr {
| Op::ISetP(_)
| Op::Lop2(_)
| Op::Lop3(_)
| Op::Shf(_) => true,
| Op::Shf(_)
| Op::Shl(_)
| Op::Shr(_) => true,
// Conversions are variable latency?!?
Op::F2F(_) | Op::F2I(_) | Op::I2F(_) | Op::FRnd(_) => false,

View file

@ -72,6 +72,16 @@ fn swap_srcs_if_not_reg(x: &mut Src, y: &mut Src) -> bool {
}
}
/// Replaces `src` with a copy in register file `file` whenever
/// `src.as_imm_not_i20()` reports a value; otherwise leaves `src` untouched.
///
/// NOTE(review): judging by the name, `as_imm_not_i20` yields the immediate
/// only when it does not fit the 20-bit immediate encoding; confirm against
/// its definition.
fn copy_src_if_i20_overflow(
    b: &mut impl SSABuilder,
    src: &mut Src,
    file: RegFile,
) {
    // Nothing to legalize unless the helper flags the source.
    if src.as_imm_not_i20().is_none() {
        return;
    }
    copy_src(b, src, file);
}
fn legalize_sm50_instr(
b: &mut impl SSABuilder,
_bl: &impl BlockLiveness,
@ -83,6 +93,14 @@ fn legalize_sm50_instr(
copy_src_if_not_reg(b, &mut op.shift, RegFile::GPR);
copy_src_if_not_reg(b, &mut op.high, RegFile::GPR);
}
Op::Shl(op) => {
copy_src_if_not_reg(b, &mut op.src, RegFile::GPR);
copy_src_if_i20_overflow(b, &mut op.shift, RegFile::GPR);
}
Op::Shr(op) => {
copy_src_if_not_reg(b, &mut op.src, RegFile::GPR);
copy_src_if_i20_overflow(b, &mut op.shift, RegFile::GPR);
}
Op::FAdd(op) => {
let [ref mut src0, ref mut src1] = op.srcs;
swap_srcs_if_not_reg(src0, src1);