nak: add input predicate to load_global_nv and OpLd

This is new in SM75 (Turing). Let's use it because it allows us to get rid
of the if/else around bounds-checked global loads.

There are some changes in fossils; these appear to be mostly due to CFG
optimizations making slightly different decisions.

Totals:
CodeSize: 9442152688 -> 9442133184 (-0.00%); split: -0.00%, +0.00%
Static cycle count: 6120910991 -> 6120907718 (-0.00%); split: -0.00%, +0.00%
Spills to reg: 184789 -> 184810 (+0.01%)
Fills from reg: 223831 -> 223860 (+0.01%); split: -0.00%, +0.01%

Totals from 334 (0.03% of 1163204) affected shaders:
CodeSize: 22020752 -> 22001248 (-0.09%); split: -0.10%, +0.01%
Static cycle count: 26582978 -> 26579705 (-0.01%); split: -0.01%, +0.00%
Spills to reg: 3110 -> 3131 (+0.68%)
Fills from reg: 3401 -> 3430 (+0.85%); split: -0.03%, +0.88%

Reviewed-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Acked-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40272>
This commit is contained in:
Karol Herbst 2026-03-05 14:53:29 +01:00 committed by Marge Bot
parent d2bf824baf
commit 9d90cbc314
13 changed files with 67 additions and 6 deletions

View file

@ -1845,7 +1845,8 @@ store("global_amd", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET, WRIT
# src[] = { address }. BASE is a 24 bit unsigned offset if a constant 0 address is given,
# signed otherwise.
load("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
# load_global_nv has an additional boolean input that makes the load return 0 on false.
load("global_nv", [1, 1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
store("global_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
load("scratch_nv", [1], indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
store("scratch_nv", [1], indices=[BASE, ALIGN_MUL, ALIGN_OFFSET])

View file

@ -756,6 +756,11 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
int32_t min = ~BITFIELD_MASK(const_bits - 1);
validate_assert(state, base >= min && base < max);
}
if (instr->intrinsic == nir_intrinsic_load_global_nv) {
validate_assert(state, instr->src[1].ssa->bit_size == 1);
}
break;
}

View file

@ -3071,11 +3071,13 @@ impl<'a> ShaderFromNir<'a> {
.get_eviction_priority(intrin.access()),
};
let addr = self.get_src(&srcs[0]);
let pred = self.get_src(&srcs[1]);
let dst = b.alloc_ssa_vec(RegFile::GPR, size_B.div_ceil(4));
b.push_op(OpLd {
dst: dst.clone().into(),
addr: addr,
pred: pred,
offset: intrin.base(),
stride: OffsetStride::X1,
access: access,
@ -3186,6 +3188,7 @@ impl<'a> ShaderFromNir<'a> {
b.push_op(OpLd {
dst: dst.clone().into(),
addr: addr,
pred: true.into(),
offset: intrin.base(),
stride: OffsetStride::X1,
access: access,
@ -3208,6 +3211,7 @@ impl<'a> ShaderFromNir<'a> {
b.push_op(OpLd {
dst: dst.clone().into(),
addr: addr,
pred: true.into(),
offset: intrin.base(),
stride: intrin.offset_shift_nv().try_into().unwrap(),
access: access,

View file

@ -154,6 +154,7 @@ impl<'a> TestShaderBuilder<'a> {
self.push_op(OpLd {
dst: dst.clone().into(),
addr: self.data_addr.clone().into(),
pred: true.into(),
offset: offset.into(),
access: access,
stride: OffsetStride::X1,

View file

@ -6459,6 +6459,10 @@ pub struct OpLd {
#[src_type(GPR)]
pub addr: Src,
/// Predicate input: when false, the load returns 0
#[src_type(Pred)]
pub pred: Src,
pub offset: i32,
pub stride: OffsetStride,
pub access: MemAccess,
@ -6470,7 +6474,7 @@ impl DisplayOp for OpLd {
if self.offset > 0 {
write!(f, "+{:#x}", self.offset)?;
}
write!(f, "]")
write!(f, "], {}", self.pred)
}
}
impl_display_for_op!(OpLd);

View file

@ -95,6 +95,7 @@ impl LowerCopySwap {
b.push_op(OpLd {
dst: copy.dst,
addr: Src::ZERO,
pred: true.into(),
offset: addr.try_into().unwrap(),
stride: OffsetStride::X1,
access: access,

View file

@ -291,6 +291,7 @@ pub fn test_ld_st_atom() {
let r1 = RegRef::new(RegFile::GPR, 1, 1);
let r2 = RegRef::new(RegFile::GPR, 2, 1);
let r3 = RegRef::new(RegFile::GPR, 3, 1);
let p4 = RegRef::new(RegFile::Pred, 4, 1);
let order = MemOrder::Strong(MemScope::CTA);
@ -338,11 +339,23 @@ pub fn test_ld_st_atom() {
let instr = OpLd {
dst: Dst::Reg(r0),
addr: SrcRef::Reg(r1).into(),
pred: if matches!(space, MemSpace::Global(_))
&& sm >= 73
{
SrcRef::Reg(p4).into()
} else {
true.into()
},
offset: addr_offset,
access: access.clone(),
stride: addr_stride,
};
let expected = match space {
MemSpace::Global(_) if sm >= 73 => {
format!(
"ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}], p4;"
)
}
MemSpace::Global(_) => {
format!(
"ldg.e.ef.strong.{cta} r0, [r1+{addr_offset_str}];"

View file

@ -2310,6 +2310,7 @@ impl SM20Op for OpLd {
fn encode(&self, e: &mut SM20Encoder<'_>) {
assert_eq!(self.stride, OffsetStride::X1);
assert!(self.pred.is_true());
match self.access.space {
MemSpace::Global(addr_type) => {
e.set_opcode(SM20Unit::Mem, 0x20);

View file

@ -2550,6 +2550,7 @@ impl SM32Op for OpLd {
fn encode(&self, e: &mut SM32Encoder<'_>) {
assert_eq!(self.stride, OffsetStride::X1);
assert!(self.pred.is_true());
// Missing:
// 0x7c8 for indirect const load
match self.access.space {

View file

@ -2598,6 +2598,7 @@ impl SM50Op for OpLd {
fn encode(&self, e: &mut SM50Encoder<'_>) {
assert_eq!(self.stride, OffsetStride::X1);
assert!(self.pred.is_true());
e.set_opcode(match self.access.space {
MemSpace::Global(_) => 0xeed0,
MemSpace::Local => 0xef40,

View file

@ -150,13 +150,13 @@ impl SM70Encoder<'_> {
self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
}
fn set_rev_upred_src(
fn set_rev_pred_src_file(
&mut self,
range: Range<usize>,
not_bit: usize,
src: &Src,
file: RegFile,
) {
let file = RegFile::UPred;
let (not, reg) = match src.src_ref {
SrcRef::True => (false, self.true_reg(file)),
SrcRef::False => (true, self.true_reg(file)),
@ -177,6 +177,24 @@ impl SM70Encoder<'_> {
self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
}
/// Encodes `src` as a regular (`RegFile::Pred`) predicate source.
///
/// Thin wrapper over `set_rev_pred_src_file`: the predicate goes into
/// `range` and its negation flag into `not_bit`.
fn set_rev_pred_src(
    &mut self,
    range: Range<usize>,
    not_bit: usize,
    src: &Src,
) {
    self.set_rev_pred_src_file(range, not_bit, src, RegFile::Pred)
}
/// Encodes `src` as a uniform (`RegFile::UPred`) predicate source.
///
/// Thin wrapper over `set_rev_pred_src_file`: the predicate goes into
/// `range` and its negation flag into `not_bit`.
fn set_rev_upred_src(
    &mut self,
    range: Range<usize>,
    not_bit: usize,
    src: &Src,
) {
    self.set_rev_pred_src_file(range, not_bit, src, RegFile::UPred)
}
fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
let mut v = BitMutView::new_subset(self, range);
v.set_field(6..22, cb.offset);
@ -3019,6 +3037,7 @@ impl SM70Op for OpSuAtom {
impl SM70Op for OpLd {
fn legalize(&mut self, b: &mut LegalizeBuilder) {
b.copy_src_if_uniform(&mut self.addr);
b.copy_src_if_uniform(&mut self.pred);
}
fn encode(&self, e: &mut SM70Encoder<'_>) {
@ -3026,10 +3045,16 @@ impl SM70Op for OpLd {
MemSpace::Global(_) => {
e.set_opcode(0x381);
assert_eq!(self.stride, OffsetStride::X1);
if e.sm >= 73 {
e.set_rev_pred_src(64..67, 67, &self.pred);
} else {
assert!(self.pred.is_true());
}
e.set_pred_dst(81..84, &Dst::None);
e.set_mem_access(&self.access);
}
MemSpace::Local => {
assert!(self.pred.is_true());
assert_eq!(self.stride, OffsetStride::X1);
e.set_opcode(0x983);
e.set_field(84..87, 1_u8);
@ -3043,6 +3068,7 @@ impl SM70Op for OpLd {
}
MemSpace::Shared => {
e.set_opcode(0x984);
assert!(self.pred.is_true());
e.set_mem_type(73..76, self.access.mem_type);
assert!(self.access.order == MemOrder::Strong(MemScope::CTA));

View file

@ -1014,9 +1014,11 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
switch (intr->intrinsic) {
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
case nir_intrinsic_load_global_constant: {
nir_def *nir_true = nir_imm_bool(&b, true);
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true);
break;
}
case nir_intrinsic_load_scratch:
res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
break;

View file

@ -63,6 +63,7 @@ lower_ldcx_to_global(nir_builder *b, nir_intrinsic_instr *load,
val = nir_load_global_nv(b,
load->def.num_components, load->def.bit_size,
nir_iadd(b, addr, nir_u2u64(b, offset)),
nir_imm_bool(b, true),
.align_mul = nir_intrinsic_align_mul(load),
.align_offset = nir_intrinsic_align_offset(load),
.access = ACCESS_CAN_REORDER,