From bd6d21038674523651195692ea7e4a94c405bfc0 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Tue, 24 Feb 2026 11:53:17 -0500
Subject: [PATCH] nir: add shuffle_intel

Jay will use this to lower & optimize subgroup shuffles. This is closer
to how Intel hardware works but still much higher level than the
hardware primitive. This gets us NIR optimizations on the multiply,
however.

Signed-off-by: Alyssa Rosenzweig
Reviewed-by: Lionel Landwerlin
Reviewed-by: Kenneth Graunke
Part-of:
---
 src/compiler/nir/nir_divergence_analysis.c | 1 +
 src/compiler/nir/nir_intrinsics.py         | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index ed546a6f782..63fd6a1ba66 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -823,6 +823,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    }
 
    case nir_intrinsic_shuffle:
+   case nir_intrinsic_shuffle_intel:
       is_divergent = src_divergent(instr->src[0], state) &&
                      src_divergent(instr->src[1], state);
       break;
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 8f2a3ccb16a..395e285ae47 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2622,6 +2622,10 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32],
 # src[] = { payload, 2x32 descriptor, predicate }
 intrinsic("store_render_target_intel", [-1, 2, 1], indices=[EOT], bit_sizes=[32])
 
+# Shuffle with an offset in bytes instead of a lane index.
+# src[] = { payload, lane offset in bytes }
+intrinsic("shuffle_intel", src_comp=[1, 1], dest_comp=0, bit_sizes=src0, flags=SUBGROUP_FLAGS)
+
 # 1 component 32bit surface index that can be used for bindless or BTI heaps
 #
 # This intrinsic is used to figure out what UBOs accesses could be promoted to
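
For context, a minimal sketch of the kind of lowering the commit message describes: rewriting nir_intrinsic_shuffle into shuffle_intel by scaling the lane index to a byte offset, so the multiply becomes visible to NIR's algebraic and constant-folding passes. This is not the backend's actual lowering; lower_shuffle_to_intel() is an illustrative name, and the nir_shuffle_intel() builder helper is assumed to be auto-generated from the intrinsic definition added above (recent Mesa NIR builder APIs assumed throughout).

```c
#include "nir.h"
#include "nir_builder.h"

/* Hypothetical sketch, not part of the patch: turn a lane-indexed
 * nir_intrinsic_shuffle into the byte-offset shuffle_intel form. */
static bool
lower_shuffle_to_intel(nir_builder *b, nir_intrinsic_instr *intrin,
                       void *data)
{
   if (intrin->intrinsic != nir_intrinsic_shuffle)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *value = intrin->src[0].ssa;
   nir_def *lane = intrin->src[1].ssa;

   /* Byte offset = lane index * bytes per channel (1-bit booleans
    * ignored for brevity).  Emitting this multiply in NIR is what lets
    * the optimizer fold it, per the commit message. */
   nir_def *offset = nir_imul_imm(b, lane, value->bit_size / 8);

   /* nir_shuffle_intel() is the builder helper assumed to be generated
    * from the nir_intrinsics.py entry. */
   nir_def *result = nir_shuffle_intel(b, value, offset);

   nir_def_rewrite_uses(&intrin->def, result);
   nir_instr_remove(&intrin->instr);
   return true;
}
```

A callback like this would typically be driven over the whole shader by something like nir_shader_intrinsics_pass() in recent Mesa trees, run before the backend consumes the NIR.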