nir: add shuffle_intel

Jay will use this to lower & optimize subgroup shuffles. This is closer to
how Intel hardware works, but still much higher level than the hardware
primitive. However, it gets us NIR optimizations on the multiply.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
This commit is contained in:
Alyssa Rosenzweig 2026-02-24 11:53:17 -05:00 committed by Marge Bot
parent b840b178af
commit bd6d210386
2 changed files with 5 additions and 0 deletions

View file

@ -823,6 +823,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
}
case nir_intrinsic_shuffle:
case nir_intrinsic_shuffle_intel:
is_divergent = src_divergent(instr->src[0], state) &&
src_divergent(instr->src[1], state);
break;

View file

@ -2622,6 +2622,10 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32],
# src[] = { payload, 2x32 descriptor, predicate }
intrinsic("store_render_target_intel", [-1, 2, 1], indices=[EOT], bit_sizes=[32])
# Shuffle with an offset in bytes instead of a lane index.
# src[] = { payload, lane offset in bytes }
#
# Both sources are declared with one component (src_comp=[1, 1]).
# bit_sizes=src0 presumably ties the destination bit size to the payload.
# NOTE(review): the payload is declared scalar while dest_comp=0 is used for
# the destination -- confirm this combination is intended.
intrinsic("shuffle_intel", src_comp=[1, 1], dest_comp=0, bit_sizes=src0, flags=SUBGROUP_FLAGS)
# 1 component 32bit surface index that can be used for bindless or BTI heaps
#
# This intrinsic is used to figure out which UBO accesses could be promoted to