mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-13 19:40:34 +01:00
nak: add input predicate to load_global_nv and OpLd
This is new in SM75 (Turing). Let's use it because it allows us to get rid of the if/else around bounds-checked global loads.

There are some changes in fossils, but it seems that's mostly due to CFG optimizations doing things a bit differently?

Totals:
CodeSize: 9442152688 -> 9442133184 (-0.00%); split: -0.00%, +0.00%
Static cycle count: 6120910991 -> 6120907718 (-0.00%); split: -0.00%, +0.00%
Spills to reg: 184789 -> 184810 (+0.01%)
Fills from reg: 223831 -> 223860 (+0.01%); split: -0.00%, +0.01%

Totals from 334 (0.03% of 1163204) affected shaders:
CodeSize: 22020752 -> 22001248 (-0.09%); split: -0.10%, +0.01%
Static cycle count: 26582978 -> 26579705 (-0.01%); split: -0.01%, +0.00%
Spills to reg: 3110 -> 3131 (+0.68%)
Fills from reg: 3401 -> 3430 (+0.85%); split: -0.03%, +0.88%

Reviewed-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Acked-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40272>
This commit is contained in:
parent
d2bf824baf
commit
9d90cbc314
13 changed files with 67 additions and 6 deletions
|
|
@ -1845,7 +1845,8 @@ store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRIT
|
|||
|
||||
# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given,
|
||||
# signed otherwise.
|
||||
load("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
|
||||
# load_global_nv takes an additional 1-bit boolean source (src[1]); when it is false the load returns 0.
|
||||
load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
|
||||
store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
|
||||
load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
|
||||
store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])
|
||||
|
|
|
|||
|
|
@ -756,6 +756,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
|
|||
int32_t min = ~BITFIELD_MASK(const_bits - 1);
|
||||
validate_assert(state, base >= min && base < max);
|
||||
}
|
||||
|
||||
if (instr->intrinsic == nir_intrinsic_load_global_nv) {
|
||||
validate_assert(state, instr->src[1].ssa->bit_size == 1);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3071,11 +3071,13 @@ impl<'a> ShaderFromNir<'a> {
|
|||
.get_eviction_priority(intrin.access()),
|
||||
};
|
||||
let addr = self.get_src(&srcs[0]);
|
||||
let pred = self.get_src(&srcs[1]);
|
||||
let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
|
||||
|
||||
b.push_op(OpLd {
|
||||
dst: dst.clone().into(),
|
||||
addr: addr,
|
||||
pred: pred,
|
||||
offset: intrin.base(),
|
||||
stride: OffsetStride::X1,
|
||||
access: access,
|
||||
|
|
@ -3186,6 +3188,7 @@ impl<'a> ShaderFromNir<'a> {
|
|||
b.push_op(OpLd {
|
||||
dst: dst.clone().into(),
|
||||
addr: addr,
|
||||
pred: true.into(),
|
||||
offset: intrin.base(),
|
||||
stride: OffsetStride::X1,
|
||||
access: access,
|
||||
|
|
@ -3208,6 +3211,7 @@ impl<'a> ShaderFromNir<'a> {
|
|||
b.push_op(OpLd {
|
||||
dst: dst.clone().into(),
|
||||
addr: addr,
|
||||
pred: true.into(),
|
||||
offset: intrin.base(),
|
||||
stride: intrin.offset_shift_nv().try_into().unwrap(),
|
||||
access: access,
|
||||
|
|
|
|||
|
|
@ -154,6 +154,7 @@ impl<'a> TestShaderBuilder<'a> {
|
|||
self.push_op(OpLd {
|
||||
dst: dst.clone().into(),
|
||||
addr: self.data_addr.clone().into(),
|
||||
pred: true.into(),
|
||||
offset: offset.into(),
|
||||
access: access,
|
||||
stride: OffsetStride::X1,
|
||||
|
|
|
|||
|
|
@ -6459,6 +6459,10 @@ pub struct OpLd {
|
|||
#[src_type(GPR)]
|
||||
pub addr: Src,
|
||||
|
||||
/// On false the load returns 0
|
||||
#[src_type(Pred)]
|
||||
pub pred: Src,
|
||||
|
||||
pub offset: i32,
|
||||
pub stride: OffsetStride,
|
||||
pub access: MemAccess,
|
||||
|
|
@ -6470,7 +6474,7 @@ impl DisplayOp for OpLd {
|
|||
if self.offset > 0 {
|
||||
write!(f, "+{:#x}", self.offset)?;
|
||||
}
|
||||
write!(f, "]")
|
||||
write!(f, "], {}", self.pred)
|
||||
}
|
||||
}
|
||||
impl_display_for_op!(OpLd);
|
||||
|
|
|
|||
|
|
@ -95,6 +95,7 @@ impl LowerCopySwap {
|
|||
b.push_op(OpLd {
|
||||
dst: copy.dst,
|
||||
addr: Src::ZERO,
|
||||
pred: true.into(),
|
||||
offset: addr.try_into().unwrap(),
|
||||
stride: OffsetStride::X1,
|
||||
access: access,
|
||||
|
|
|
|||
|
|
@ -291,6 +291,7 @@ pub fn test_ld_st_atom() {
|
|||
let r1 = RegRef::new(RegFile::GPR, 1, 1);
|
||||
let r2 = RegRef::new(RegFile::GPR, 2, 1);
|
||||
let r3 = RegRef::new(RegFile::GPR, 3, 1);
|
||||
let p4 = RegRef::new(RegFile::Pred, 4, 1);
|
||||
|
||||
let order = MemOrder::Strong(MemScope::CTA);
|
||||
|
||||
|
|
@ -338,11 +339,23 @@ pub fn test_ld_st_atom() {
|
|||
let instr = OpLd {
|
||||
dst: Dst::Reg(r0),
|
||||
addr: SrcRef::Reg(r1).into(),
|
||||
pred: if matches!(space, MemSpace::Global(_))
|
||||
&& sm >= 73
|
||||
{
|
||||
SrcRef::Reg(p4).into()
|
||||
} else {
|
||||
true.into()
|
||||
},
|
||||
offset: addr_offset,
|
||||
access: access.clone(),
|
||||
stride: addr_stride,
|
||||
};
|
||||
let expected = match space {
|
||||
MemSpace::Global(_) if sm >= 73 => {
|
||||
format!(
|
||||
"ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}], p4;"
|
||||
)
|
||||
}
|
||||
MemSpace::Global(_) => {
|
||||
format!(
|
||||
"ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}];"
|
||||
|
|
|
|||
|
|
@ -2310,6 +2310,7 @@ impl SM20Op for OpLd {
|
|||
|
||||
fn encode(&self, e: &mut SM20Encoder<'_>) {
|
||||
assert_eq!(self.stride, OffsetStride::X1);
|
||||
assert!(self.pred.is_true());
|
||||
match self.access.space {
|
||||
MemSpace::Global(addr_type) => {
|
||||
e.set_opcode(SM20Unit::Mem, 0x20);
|
||||
|
|
|
|||
|
|
@ -2550,6 +2550,7 @@ impl SM32Op for OpLd {
|
|||
|
||||
fn encode(&self, e: &mut SM32Encoder<'_>) {
|
||||
assert_eq!(self.stride, OffsetStride::X1);
|
||||
assert!(self.pred.is_true());
|
||||
// Missing:
|
||||
// 0x7c8 for indirect const load
|
||||
match self.access.space {
|
||||
|
|
|
|||
|
|
@ -2598,6 +2598,7 @@ impl SM50Op for OpLd {
|
|||
|
||||
fn encode(&self, e: &mut SM50Encoder<'_>) {
|
||||
assert_eq!(self.stride, OffsetStride::X1);
|
||||
assert!(self.pred.is_true());
|
||||
e.set_opcode(match self.access.space {
|
||||
MemSpace::Global(_) => 0xeed0,
|
||||
MemSpace::Local => 0xef40,
|
||||
|
|
|
|||
|
|
@ -150,13 +150,13 @@ impl SM70Encoder<'_> {
|
|||
self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
|
||||
}
|
||||
|
||||
fn set_rev_upred_src(
|
||||
fn set_rev_pred_src_file(
|
||||
&mut self,
|
||||
range: Range<usize>,
|
||||
not_bit: usize,
|
||||
src: &Src,
|
||||
file: RegFile,
|
||||
) {
|
||||
let file = RegFile::UPred;
|
||||
let (not, reg) = match src.src_ref {
|
||||
SrcRef::True => (false, self.true_reg(file)),
|
||||
SrcRef::False => (true, self.true_reg(file)),
|
||||
|
|
@ -177,6 +177,24 @@ impl SM70Encoder<'_> {
|
|||
self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
|
||||
}
|
||||
|
||||
fn set_rev_pred_src(
|
||||
&mut self,
|
||||
range: Range<usize>,
|
||||
not_bit: usize,
|
||||
src: &Src,
|
||||
) {
|
||||
self.set_rev_pred_src_file(range, not_bit, src, RegFile::Pred)
|
||||
}
|
||||
|
||||
fn set_rev_upred_src(
|
||||
&mut self,
|
||||
range: Range<usize>,
|
||||
not_bit: usize,
|
||||
src: &Src,
|
||||
) {
|
||||
self.set_rev_pred_src_file(range, not_bit, src, RegFile::UPred)
|
||||
}
|
||||
|
||||
fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
|
||||
let mut v = BitMutView::new_subset(self, range);
|
||||
v.set_field(6..22, cb.offset);
|
||||
|
|
@ -3019,6 +3037,7 @@ impl SM70Op for OpSuAtom {
|
|||
impl SM70Op for OpLd {
|
||||
fn legalize(&mut self, b: &mut LegalizeBuilder) {
|
||||
b.copy_src_if_uniform(&mut self.addr);
|
||||
b.copy_src_if_uniform(&mut self.pred);
|
||||
}
|
||||
|
||||
fn encode(&self, e: &mut SM70Encoder<'_>) {
|
||||
|
|
@ -3026,10 +3045,16 @@ impl SM70Op for OpLd {
|
|||
MemSpace::Global(_) => {
|
||||
e.set_opcode(0x381);
|
||||
assert_eq!(self.stride, OffsetStride::X1);
|
||||
if e.sm >= 73 {
|
||||
e.set_rev_pred_src(64..67, 67, &self.pred);
|
||||
} else {
|
||||
assert!(self.pred.is_true());
|
||||
}
|
||||
e.set_pred_dst(81..84, &Dst::None);
|
||||
e.set_mem_access(&self.access);
|
||||
}
|
||||
MemSpace::Local => {
|
||||
assert!(self.pred.is_true());
|
||||
assert_eq!(self.stride, OffsetStride::X1);
|
||||
e.set_opcode(0x983);
|
||||
e.set_field(84..87, 1_u8);
|
||||
|
|
@ -3043,6 +3068,7 @@ impl SM70Op for OpLd {
|
|||
}
|
||||
MemSpace::Shared => {
|
||||
e.set_opcode(0x984);
|
||||
assert!(self.pred.is_true());
|
||||
|
||||
e.set_mem_type(73..76, self.access.mem_type);
|
||||
assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
|
||||
|
|
|
|||
|
|
@ -1014,9 +1014,11 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
|
|||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_global:
|
||||
case nir_intrinsic_load_global_constant:
|
||||
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
|
||||
case nir_intrinsic_load_global_constant: {
|
||||
nir_def *nir_true = nir_imm_bool(&b, true);
|
||||
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_scratch:
|
||||
res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load,
|
|||
val = nir_load_global_nv(b,
|
||||
load->def.num_components, load->def.bit_size,
|
||||
nir_iadd(b, addr, nir_u2u64(b, offset)),
|
||||
nir_imm_bool(b, true),
|
||||
.align_mul = nir_intrinsic_align_mul(load),
|
||||
.align_offset = nir_intrinsic_align_offset(load),
|
||||
.access = ACCESS_CAN_REORDER,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue