mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 07:08:04 +02:00
nak: add input predicate to load_global_nv and OpLd
This is new in SM75 (Turing). Let's use it because it allows us to get rid of the if/else around bounds-checked global loads.

There are some changes in the fossils, but that seems to be mostly due to CFG optimizations doing things a bit differently.

Totals:
CodeSize: 9442152688 -> 9442133184 (-0.00%); split: -0.00%, +0.00%
Static cycle count: 6120910991 -> 6120907718 (-0.00%); split: -0.00%, +0.00%
Spills to reg: 184789 -> 184810 (+0.01%)
Fills from reg: 223831 -> 223860 (+0.01%); split: -0.00%, +0.01%

Totals from 334 (0.03% of 1163204) affected shaders:
CodeSize: 22020752 -> 22001248 (-0.09%); split: -0.10%, +0.01%
Static cycle count: 26582978 -> 26579705 (-0.01%); split: -0.01%, +0.00%
Spills to reg: 3110 -> 3131 (+0.68%)
Fills from reg: 3401 -> 3430 (+0.85%); split: -0.03%, +0.88%

Reviewed-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Acked-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40272>
This commit is contained in:
parent
d2bf824baf
commit
9d90cbc314
13 changed files with 67 additions and 6 deletions
|
|
@ -1845,7 +1845,8 @@ store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRIT
|
||||||
|
|
||||||
# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given,
|
# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given,
|
||||||
# signed otherwise.
|
# signed otherwise.
|
||||||
load("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
|
# load_global_nv has an additional boolean input that makes the load return 0 on false.
|
||||||
|
load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
|
||||||
store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
|
store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
|
||||||
load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
|
load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
|
||||||
store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
|
store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
|
||||||
|
|
|
||||||
|
|
@ -756,6 +756,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
|
||||||
int32_t min = ~BITFIELD_MASK(const_bits - 1);
|
int32_t min = ~BITFIELD_MASK(const_bits - 1);
|
||||||
validate_assert(state, base >= min && base < max);
|
validate_assert(state, base >= min && base < max);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (instr->intrinsic == nir_intrinsic_load_global_nv) {
|
||||||
|
validate_assert(state, instr->src[1].ssa->bit_size == 1);
|
||||||
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3071,11 +3071,13 @@ impl<'a> ShaderFromNir<'a> {
|
||||||
.get_eviction_priority(intrin.access()),
|
.get_eviction_priority(intrin.access()),
|
||||||
};
|
};
|
||||||
let addr = self.get_src(&srcs[0]);
|
let addr = self.get_src(&srcs[0]);
|
||||||
|
let pred = self.get_src(&srcs[1]);
|
||||||
let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
|
let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
|
||||||
|
|
||||||
b.push_op(OpLd {
|
b.push_op(OpLd {
|
||||||
dst: dst.clone().into(),
|
dst: dst.clone().into(),
|
||||||
addr: addr,
|
addr: addr,
|
||||||
|
pred: pred,
|
||||||
offset: intrin.base(),
|
offset: intrin.base(),
|
||||||
stride: OffsetStride::X1,
|
stride: OffsetStride::X1,
|
||||||
access: access,
|
access: access,
|
||||||
|
|
@ -3186,6 +3188,7 @@ impl<'a> ShaderFromNir<'a> {
|
||||||
b.push_op(OpLd {
|
b.push_op(OpLd {
|
||||||
dst: dst.clone().into(),
|
dst: dst.clone().into(),
|
||||||
addr: addr,
|
addr: addr,
|
||||||
|
pred: true.into(),
|
||||||
offset: intrin.base(),
|
offset: intrin.base(),
|
||||||
stride: OffsetStride::X1,
|
stride: OffsetStride::X1,
|
||||||
access: access,
|
access: access,
|
||||||
|
|
@ -3208,6 +3211,7 @@ impl<'a> ShaderFromNir<'a> {
|
||||||
b.push_op(OpLd {
|
b.push_op(OpLd {
|
||||||
dst: dst.clone().into(),
|
dst: dst.clone().into(),
|
||||||
addr: addr,
|
addr: addr,
|
||||||
|
pred: true.into(),
|
||||||
offset: intrin.base(),
|
offset: intrin.base(),
|
||||||
stride: intrin.offset_shift_nv().try_into().unwrap(),
|
stride: intrin.offset_shift_nv().try_into().unwrap(),
|
||||||
access: access,
|
access: access,
|
||||||
|
|
|
||||||
|
|
@ -154,6 +154,7 @@ impl<'a> TestShaderBuilder<'a> {
|
||||||
self.push_op(OpLd {
|
self.push_op(OpLd {
|
||||||
dst: dst.clone().into(),
|
dst: dst.clone().into(),
|
||||||
addr: self.data_addr.clone().into(),
|
addr: self.data_addr.clone().into(),
|
||||||
|
pred: true.into(),
|
||||||
offset: offset.into(),
|
offset: offset.into(),
|
||||||
access: access,
|
access: access,
|
||||||
stride: OffsetStride::X1,
|
stride: OffsetStride::X1,
|
||||||
|
|
|
||||||
|
|
@ -6459,6 +6459,10 @@ pub struct OpLd {
|
||||||
#[src_type(GPR)]
|
#[src_type(GPR)]
|
||||||
pub addr: Src,
|
pub addr: Src,
|
||||||
|
|
||||||
|
/// On false the load returns 0
|
||||||
|
#[src_type(Pred)]
|
||||||
|
pub pred: Src,
|
||||||
|
|
||||||
pub offset: i32,
|
pub offset: i32,
|
||||||
pub stride: OffsetStride,
|
pub stride: OffsetStride,
|
||||||
pub access: MemAccess,
|
pub access: MemAccess,
|
||||||
|
|
@ -6470,7 +6474,7 @@ impl DisplayOp for OpLd {
|
||||||
if self.offset > 0 {
|
if self.offset > 0 {
|
||||||
write!(f, "+{:#x}", self.offset)?;
|
write!(f, "+{:#x}", self.offset)?;
|
||||||
}
|
}
|
||||||
write!(f, "]")
|
write!(f, "], {}", self.pred)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl_display_for_op!(OpLd);
|
impl_display_for_op!(OpLd);
|
||||||
|
|
|
||||||
|
|
@ -95,6 +95,7 @@ impl LowerCopySwap {
|
||||||
b.push_op(OpLd {
|
b.push_op(OpLd {
|
||||||
dst: copy.dst,
|
dst: copy.dst,
|
||||||
addr: Src::ZERO,
|
addr: Src::ZERO,
|
||||||
|
pred: true.into(),
|
||||||
offset: addr.try_into().unwrap(),
|
offset: addr.try_into().unwrap(),
|
||||||
stride: OffsetStride::X1,
|
stride: OffsetStride::X1,
|
||||||
access: access,
|
access: access,
|
||||||
|
|
|
||||||
|
|
@ -291,6 +291,7 @@ pub fn test_ld_st_atom() {
|
||||||
let r1 = RegRef::new(RegFile::GPR, 1, 1);
|
let r1 = RegRef::new(RegFile::GPR, 1, 1);
|
||||||
let r2 = RegRef::new(RegFile::GPR, 2, 1);
|
let r2 = RegRef::new(RegFile::GPR, 2, 1);
|
||||||
let r3 = RegRef::new(RegFile::GPR, 3, 1);
|
let r3 = RegRef::new(RegFile::GPR, 3, 1);
|
||||||
|
let p4 = RegRef::new(RegFile::Pred, 4, 1);
|
||||||
|
|
||||||
let order = MemOrder::Strong(MemScope::CTA);
|
let order = MemOrder::Strong(MemScope::CTA);
|
||||||
|
|
||||||
|
|
@ -338,11 +339,23 @@ pub fn test_ld_st_atom() {
|
||||||
let instr = OpLd {
|
let instr = OpLd {
|
||||||
dst: Dst::Reg(r0),
|
dst: Dst::Reg(r0),
|
||||||
addr: SrcRef::Reg(r1).into(),
|
addr: SrcRef::Reg(r1).into(),
|
||||||
|
pred: if matches!(space, MemSpace::Global(_))
|
||||||
|
&& sm >= 73
|
||||||
|
{
|
||||||
|
SrcRef::Reg(p4).into()
|
||||||
|
} else {
|
||||||
|
true.into()
|
||||||
|
},
|
||||||
offset: addr_offset,
|
offset: addr_offset,
|
||||||
access: access.clone(),
|
access: access.clone(),
|
||||||
stride: addr_stride,
|
stride: addr_stride,
|
||||||
};
|
};
|
||||||
let expected = match space {
|
let expected = match space {
|
||||||
|
MemSpace::Global(_) if sm >= 73 => {
|
||||||
|
format!(
|
||||||
|
"ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}], p4;"
|
||||||
|
)
|
||||||
|
}
|
||||||
MemSpace::Global(_) => {
|
MemSpace::Global(_) => {
|
||||||
format!(
|
format!(
|
||||||
"ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}];"
|
"ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}];"
|
||||||
|
|
|
||||||
|
|
@ -2310,6 +2310,7 @@ impl SM20Op for OpLd {
|
||||||
|
|
||||||
fn encode(&self, e: &mut SM20Encoder<'_>) {
|
fn encode(&self, e: &mut SM20Encoder<'_>) {
|
||||||
assert_eq!(self.stride, OffsetStride::X1);
|
assert_eq!(self.stride, OffsetStride::X1);
|
||||||
|
assert!(self.pred.is_true());
|
||||||
match self.access.space {
|
match self.access.space {
|
||||||
MemSpace::Global(addr_type) => {
|
MemSpace::Global(addr_type) => {
|
||||||
e.set_opcode(SM20Unit::Mem, 0x20);
|
e.set_opcode(SM20Unit::Mem, 0x20);
|
||||||
|
|
|
||||||
|
|
@ -2550,6 +2550,7 @@ impl SM32Op for OpLd {
|
||||||
|
|
||||||
fn encode(&self, e: &mut SM32Encoder<'_>) {
|
fn encode(&self, e: &mut SM32Encoder<'_>) {
|
||||||
assert_eq!(self.stride, OffsetStride::X1);
|
assert_eq!(self.stride, OffsetStride::X1);
|
||||||
|
assert!(self.pred.is_true());
|
||||||
// Missing:
|
// Missing:
|
||||||
// 0x7c8 for indirect const load
|
// 0x7c8 for indirect const load
|
||||||
match self.access.space {
|
match self.access.space {
|
||||||
|
|
|
||||||
|
|
@ -2598,6 +2598,7 @@ impl SM50Op for OpLd {
|
||||||
|
|
||||||
fn encode(&self, e: &mut SM50Encoder<'_>) {
|
fn encode(&self, e: &mut SM50Encoder<'_>) {
|
||||||
assert_eq!(self.stride, OffsetStride::X1);
|
assert_eq!(self.stride, OffsetStride::X1);
|
||||||
|
assert!(self.pred.is_true());
|
||||||
e.set_opcode(match self.access.space {
|
e.set_opcode(match self.access.space {
|
||||||
MemSpace::Global(_) => 0xeed0,
|
MemSpace::Global(_) => 0xeed0,
|
||||||
MemSpace::Local => 0xef40,
|
MemSpace::Local => 0xef40,
|
||||||
|
|
|
||||||
|
|
@ -150,13 +150,13 @@ impl SM70Encoder<'_> {
|
||||||
self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
|
self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_rev_upred_src(
|
fn set_rev_pred_src_file(
|
||||||
&mut self,
|
&mut self,
|
||||||
range: Range<usize>,
|
range: Range<usize>,
|
||||||
not_bit: usize,
|
not_bit: usize,
|
||||||
src: &Src,
|
src: &Src,
|
||||||
|
file: RegFile,
|
||||||
) {
|
) {
|
||||||
let file = RegFile::UPred;
|
|
||||||
let (not, reg) = match src.src_ref {
|
let (not, reg) = match src.src_ref {
|
||||||
SrcRef::True => (false, self.true_reg(file)),
|
SrcRef::True => (false, self.true_reg(file)),
|
||||||
SrcRef::False => (true, self.true_reg(file)),
|
SrcRef::False => (true, self.true_reg(file)),
|
||||||
|
|
@ -177,6 +177,24 @@ impl SM70Encoder<'_> {
|
||||||
self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
|
self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn set_rev_pred_src(
|
||||||
|
&mut self,
|
||||||
|
range: Range<usize>,
|
||||||
|
not_bit: usize,
|
||||||
|
src: &Src,
|
||||||
|
) {
|
||||||
|
self.set_rev_pred_src_file(range, not_bit, src, RegFile::Pred)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_rev_upred_src(
|
||||||
|
&mut self,
|
||||||
|
range: Range<usize>,
|
||||||
|
not_bit: usize,
|
||||||
|
src: &Src,
|
||||||
|
) {
|
||||||
|
self.set_rev_pred_src_file(range, not_bit, src, RegFile::UPred)
|
||||||
|
}
|
||||||
|
|
||||||
fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
|
fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
|
||||||
let mut v = BitMutView::new_subset(self, range);
|
let mut v = BitMutView::new_subset(self, range);
|
||||||
v.set_field(6..22, cb.offset);
|
v.set_field(6..22, cb.offset);
|
||||||
|
|
@ -3019,6 +3037,7 @@ impl SM70Op for OpSuAtom {
|
||||||
impl SM70Op for OpLd {
|
impl SM70Op for OpLd {
|
||||||
fn legalize(&mut self, b: &mut LegalizeBuilder) {
|
fn legalize(&mut self, b: &mut LegalizeBuilder) {
|
||||||
b.copy_src_if_uniform(&mut self.addr);
|
b.copy_src_if_uniform(&mut self.addr);
|
||||||
|
b.copy_src_if_uniform(&mut self.pred);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn encode(&self, e: &mut SM70Encoder<'_>) {
|
fn encode(&self, e: &mut SM70Encoder<'_>) {
|
||||||
|
|
@ -3026,10 +3045,16 @@ impl SM70Op for OpLd {
|
||||||
MemSpace::Global(_) => {
|
MemSpace::Global(_) => {
|
||||||
e.set_opcode(0x381);
|
e.set_opcode(0x381);
|
||||||
assert_eq!(self.stride, OffsetStride::X1);
|
assert_eq!(self.stride, OffsetStride::X1);
|
||||||
|
if e.sm >= 73 {
|
||||||
|
e.set_rev_pred_src(64..67, 67, &self.pred);
|
||||||
|
} else {
|
||||||
|
assert!(self.pred.is_true());
|
||||||
|
}
|
||||||
e.set_pred_dst(81..84, &Dst::None);
|
e.set_pred_dst(81..84, &Dst::None);
|
||||||
e.set_mem_access(&self.access);
|
e.set_mem_access(&self.access);
|
||||||
}
|
}
|
||||||
MemSpace::Local => {
|
MemSpace::Local => {
|
||||||
|
assert!(self.pred.is_true());
|
||||||
assert_eq!(self.stride, OffsetStride::X1);
|
assert_eq!(self.stride, OffsetStride::X1);
|
||||||
e.set_opcode(0x983);
|
e.set_opcode(0x983);
|
||||||
e.set_field(84..87, 1_u8);
|
e.set_field(84..87, 1_u8);
|
||||||
|
|
@ -3043,6 +3068,7 @@ impl SM70Op for OpLd {
|
||||||
}
|
}
|
||||||
MemSpace::Shared => {
|
MemSpace::Shared => {
|
||||||
e.set_opcode(0x984);
|
e.set_opcode(0x984);
|
||||||
|
assert!(self.pred.is_true());
|
||||||
|
|
||||||
e.set_mem_type(73..76, self.access.mem_type);
|
e.set_mem_type(73..76, self.access.mem_type);
|
||||||
assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
|
assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
|
||||||
|
|
|
||||||
|
|
@ -1014,9 +1014,11 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
|
||||||
|
|
||||||
switch (intr->intrinsic) {
|
switch (intr->intrinsic) {
|
||||||
case nir_intrinsic_load_global:
|
case nir_intrinsic_load_global:
|
||||||
case nir_intrinsic_load_global_constant:
|
case nir_intrinsic_load_global_constant: {
|
||||||
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
|
nir_def *nir_true = nir_imm_bool(&b, true);
|
||||||
|
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case nir_intrinsic_load_scratch:
|
case nir_intrinsic_load_scratch:
|
||||||
res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
|
res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
|
||||||
break;
|
break;
|
||||||
|
|
|
||||||
|
|
@ -63,6 +63,7 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load,
|
||||||
val = nir_load_global_nv(b,
|
val = nir_load_global_nv(b,
|
||||||
load->def.num_components, load->def.bit_size,
|
load->def.num_components, load->def.bit_size,
|
||||||
nir_iadd(b, addr, nir_u2u64(b, offset)),
|
nir_iadd(b, addr, nir_u2u64(b, offset)),
|
||||||
|
nir_imm_bool(b, true),
|
||||||
.align_mul = nir_intrinsic_align_mul(load),
|
.align_mul = nir_intrinsic_align_mul(load),
|
||||||
.align_offset = nir_intrinsic_align_offset(load),
|
.align_offset = nir_intrinsic_align_offset(load),
|
||||||
.access = ACCESS_CAN_REORDER,
|
.access = ACCESS_CAN_REORDER,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue