nir/divergence: add a new mode to cover fused threads on Intel HW

The Intel Gfx12.x generation of GPUs has an architectural feature called
EU fusion in which 2 subgroups run in lock step. A typical case where
this happens is a compute shader with a 1x1x1 local workgroup size and a
2x1x1 dispatch command. In that case 2 threads run in lock step, one
for each workgroup.
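
As an illustration, here is a minimal sketch of the dispatch shape
described above (hypothetical host code, assuming a Vulkan compute
pipeline whose shader declares a 1x1x1 local workgroup size):

  #include <vulkan/vulkan.h>

  /* Sketch only: 2 workgroups of a single invocation each.  On Gfx12.x
   * the 2 resulting subgroups may run in lock step (EU fusion).
   */
  static void
  dispatch_two_tiny_workgroups(VkCommandBuffer cmd)
  {
     /* The bound compute shader is assumed to declare
      * layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
      */
     vkCmdDispatch(cmd, 2, 1, 1);
  }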

This has been the source of some trouble in the backend because one
subgroup can run with all lanes disabled, requiring care for SEND
messages using the NoMask flag (execution regardless of the lane mask).

We found out that other things happen when 2 subgroups run together:
  - the HW will use the surface/sampler handle from only one subgroup
  - the HW will use the sampler header from only one subgroup

So one of the fused subgroups can access the wrong surface/sampler if
the value differs between the 2 subgroups, and that can happen even
with subgroup-uniform values. For example, a descriptor index derived
from the workgroup ID is uniform within each subgroup, yet differs
between the 2 fused subgroups in the 2x1x1 dispatch above.

Fortunately we can flag SEND instructions to disable the fusion
behavior (most likely at a performance cost).

This change introduces a new divergence mode that computes which values
are divergent between subgroups so that we can flag instructions
accordingly.
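
As an illustration, a hypothetical sketch of how a backend could opt
into the new mode and consume its result; the option plumbing and the
helpers below are assumptions for illustration, not the actual Intel
backend code:

  #include "nir.h"

  /* Assumed: the driver requests the new mode next to its other
   * divergence options when it fills in its compiler options.
   */
  static void
  init_divergence_options(struct nir_shader_compiler_options *opts)
  {
     opts->divergence_analysis_options |= nir_divergence_across_subgroups;
  }

  /* Assumed entry point: rerun divergence analysis, then check the
   * per-definition divergent bit.  A definition flagged divergent in
   * this mode may differ between the 2 fused subgroups, so a SEND
   * consuming it would need the fusion-disable flag.
   */
  static bool
  divergent_across_fused_subgroups(nir_shader *shader, nir_def *def)
  {
     nir_divergence_analysis(shader);
     return def->divergent;
  }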

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37394>

@@ -206,7 +206,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
        * subgroups, so subgroup ops are always divergent between vertices of
        * the same primitive.
        */
-      is_divergent = state->options & nir_divergence_vertex;
+      is_divergent = (state->options & nir_divergence_vertex) ||
+                     (state->options & nir_divergence_across_subgroups);
       break;
 
    /* Intrinsics which are always uniform */
@@ -398,6 +399,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
       } else {
          is_divergent = true;
       }
+      if (options & nir_divergence_across_subgroups)
+         is_divergent = true;
       break;
    case nir_intrinsic_load_attribute_pan:
       assert(stage == MESA_SHADER_VERTEX);
@@ -414,6 +417,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
          is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
       else
          is_divergent = true;
+      if (options & nir_divergence_across_subgroups)
+         is_divergent = true;
       break;
    case nir_intrinsic_load_input_vertex:
       is_divergent = src_divergent(instr->src[1], state);
@@ -530,7 +535,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
          * vertices of the same primitive because they may be in
          * different subgroups.
          */
-         is_divergent = state->options & nir_divergence_vertex;
+         is_divergent = (state->options & nir_divergence_vertex) ||
+                        (state->options & nir_divergence_across_subgroups);
          break;
       }
       FALLTHROUGH;
@@ -538,7 +544,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_inclusive_scan_clusters_ir3: {
       nir_op op = nir_intrinsic_reduction_op(instr);
       is_divergent = src_divergent(instr->src[0], state) ||
-                     state->options & nir_divergence_vertex;
+                     (state->options & nir_divergence_vertex) ||
+                     (state->options & nir_divergence_across_subgroups);
       if (op != nir_op_umin && op != nir_op_imin && op != nir_op_fmin &&
           op != nir_op_umax && op != nir_op_imax && op != nir_op_fmax &&
           op != nir_op_iand && op != nir_op_ior)
@@ -550,7 +557,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
       /* This reduces the last invocations in all 8-wide clusters. It should
        * behave the same as reduce with cluster_size == subgroup_size.
        */
-      is_divergent = state->options & nir_divergence_vertex;
+      is_divergent = (state->options & nir_divergence_vertex) ||
+                     (state->options & nir_divergence_across_subgroups);
       break;
 
    case nir_intrinsic_load_ubo:
@@ -749,9 +757,13 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
       /* Not having the non_uniform flag with divergent sources is undefined
        * behavior. The Intel driver defines it pick the lowest numbered live
        * SIMD lane (via emit_uniformize).
+       *
+       * When gathering the divergence across subgroups, we need to propagate
+       * the divergence from the sources.
        */
       if ((nir_intrinsic_resource_access_intel(instr) &
-           nir_resource_intel_non_uniform) != 0) {
+           nir_resource_intel_non_uniform) != 0 ||
+          (state->options & nir_divergence_across_subgroups)) {
          unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
          for (unsigned i = 0; i < num_srcs; i++) {
             if (src_divergent(instr->src[i], state)) {

@@ -77,12 +77,19 @@ typedef enum {
    nir_divergence_uniform_load_tears = (1 << 7),
    /* If used, this allows phis for divergent merges with undef and a uniform source to be considered uniform */
    nir_divergence_ignore_undef_if_phi_srcs = (1 << 8),
    /* Whether to compute vertex divergence (meaning between vertices
     * of the same primitive) instead of subgroup invocation divergence
     * (between invocations of the same subgroup). For example, patch input
     * loads are always convergent, while subgroup intrinsics are divergent.
     */
    nir_divergence_vertex = (1 << 11),
+   /* Whether to compute divergence of subgroup operations as if multiple
+    * subgroups ran in lock-step (for example subgroup operations normally
+    * convergent are divergent).
+    */
+   nir_divergence_across_subgroups = (1 << 12),
 } nir_divergence_options;
 
 /** An instruction filtering callback