nir,nak: Add match_any_nv

NVIDIA hardware has an instruction allowing you to retrieve the mask
of active threads matching the same source value as the current
invocation.

This is going to be used by shared memory lowering for mesh / task
stages on NVK.

Signed-off-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Tested-by: Thomas H.P. Andersen <phomes@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27196>
This commit is contained in:
Mary Guillemard 2026-03-04 16:41:00 +01:00 committed by Marge Bot
parent d88c183785
commit b95dbc64bf
3 changed files with 23 additions and 0 deletions

View file

@ -1063,6 +1063,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_agx:
case nir_intrinsic_load_shared_lock_nv:
case nir_intrinsic_store_shared_unlock_nv:
case nir_intrinsic_match_any_nv:
case nir_intrinsic_bvh_stack_rtn_amd:
case nir_intrinsic_cmat_load_shared_nv:
case nir_intrinsic_cmat_mov_transpose_nv:

View file

@ -2948,6 +2948,8 @@ intrinsic("ipa_nv", dest_comp=1, src_comp=[1, 1], bit_sizes=[32],
# FLAGS indicate if we load vertex_id == 2
intrinsic("ldtram_nv", dest_comp=2, bit_sizes=[32],
indices=[BASE, FLAGS], flags=[CAN_ELIMINATE, CAN_REORDER])
# Gives the mask of active threads matching the same source value
intrinsic("match_any_nv", src_comp=[0], dest_comp=1, flags=SUBGROUP_FLAGS)
# NVIDIA-specific Image intrinsics
# only used for kepler address calculations.

View file

@ -3901,6 +3901,26 @@ impl<'a> ShaderFromNir<'a> {
}
self.set_dst(&intrin.def, dst.into());
}
nir_intrinsic_match_any_nv => {
    // Lower match_any_nv to a single OpMatch using MatchOp::Any. The
    // resulting mask is written to a freshly allocated GPR and no
    // predicate output is requested.
    let src = self.get_src(&srcs[0]);
    // Total width of the value being compared: per-component bit size
    // times the number of components.
    let src_bits = srcs[0].bit_size() * srcs[0].num_components();
    // NOTE(review): this checks the *destination* bit size while the
    // u64 flag below is derived from the *source* width, and only one
    // GPR is allocated for the result -- confirm that a 64-bit
    // destination is really intended to be accepted here.
    assert!(
        intrin.def.bit_size() == 32 || intrin.def.bit_size() == 64
    );
    let dst = b.alloc_ssa(RegFile::GPR);
    b.push_op(OpMatch {
        op: MatchOp::Any,
        mask: dst.into(),
        pred: Dst::None,
        src,
        // Only 32-bit and 64-bit source values are handled.
        u64: match src_bits {
            32 => false,
            64 => true,
            // Name the intrinsic actually being lowered so that a
            // failure here points at the right place.
            _ => panic!("Unsupported match_any_nv bit size"),
        },
    });
    self.set_dst(&intrin.def, dst.into());
}
nir_intrinsic_is_sparse_texels_resident => {
let src = self.get_src(&srcs[0]);
let dst = b.isetp(IntCmpType::I32, IntCmpOp::Ne, src, 0.into());