nir/divergence: add a new mode to cover fused threads on Intel HW

The Intel Gfx12.x generation of GPUs has an architectural feature called
EU fusion in which 2 subgroups run in lock step. A typical case where
this happens is a compute shader with a 1x1x1 local workgroup size and a
2x1x1 dispatch command. In that case 2 threads run in lock step, one
for each workgroup.
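
As an illustration, here is a minimal sketch of the dispatch shape
described above (hypothetical host code, assuming a Vulkan compute
pipeline whose shader declares a 1x1x1 local workgroup size):

  #include <vulkan/vulkan.h>

  /* Sketch only: 2 workgroups of a single invocation each.  On Gfx12.x
   * the 2 resulting subgroups may run in lock step (EU fusion).
   */
  static void
  dispatch_two_tiny_workgroups(VkCommandBuffer cmd)
  {
     /* The bound compute shader is assumed to declare
      * layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
      */
     vkCmdDispatch(cmd, 2, 1, 1);
  }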

This has been the source of some trouble in the backend because one
subgroup can run with all lanes disabled, requiring care for SEND
messages using the NoMask flag (execution regardless of the lane mask).

We found out that other things happen when 2 subgroups run together:
  - the HW will use the surface/sampler handle from only one subgroup
  - the HW will use the sampler header from only one subgroup

So one of the fused subgroups can access the wrong surface/sampler if
the value differs between the 2 subgroups, and that can happen even
with subgroup-uniform values. For example, a descriptor index derived
from the workgroup ID is uniform within each subgroup, yet differs
between the 2 fused subgroups in the 2x1x1 dispatch above.

Fortunately we can flag SEND instructions to disable the fusion
behavior (most likely at a performance cost).

This change introduces a new divergence mode that computes which values
are divergent between subgroups so that we can flag instructions
accordingly.
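
As an illustration, a hypothetical sketch of how a backend could opt
into the new mode and consume its result; the option plumbing and the
helpers below are assumptions for illustration, not the actual Intel
backend code:

  #include "nir.h"

  /* Assumed: the driver requests the new mode next to its other
   * divergence options when it fills in its compiler options.
   */
  static void
  init_divergence_options(struct nir_shader_compiler_options *opts)
  {
     opts->divergence_analysis_options |= nir_divergence_across_subgroups;
  }

  /* Assumed entry point: rerun divergence analysis, then check the
   * per-definition divergent bit.  A definition flagged divergent in
   * this mode may differ between the 2 fused subgroups, so a SEND
   * consuming it would need the fusion-disable flag.
   */
  static bool
  divergent_across_fused_subgroups(nir_shader *shader, nir_def *def)
  {
     nir_divergence_analysis(shader);
     return def->divergent;
  }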

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37394>

@@ -206,7 +206,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
        * subgroups, so subgroup ops are always divergent between vertices of
        * the same primitive.
        */
-      is_divergent = state->options & nir_divergence_vertex;
+      is_divergent = (state->options & nir_divergence_vertex) ||
+                     (state->options & nir_divergence_across_subgroups);
       break;
 
    /* Intrinsics which are always uniform */
@@ -398,6 +399,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
       } else {
          is_divergent = true;
       }
+      if (options & nir_divergence_across_subgroups)
+         is_divergent = true;
       break;
    case nir_intrinsic_load_attribute_pan:
       assert(stage == MESA_SHADER_VERTEX);
@@ -414,6 +417,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
          is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup);
       else
          is_divergent = true;
+      if (options & nir_divergence_across_subgroups)
+         is_divergent = true;
       break;
    case nir_intrinsic_load_input_vertex:
       is_divergent = src_divergent(instr->src[1], state);
@@ -530,7 +535,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
          * vertices of the same primitive because they may be in
          * different subgroups.
          */
-         is_divergent = state->options & nir_divergence_vertex;
+         is_divergent = (state->options & nir_divergence_vertex) ||
+                        (state->options & nir_divergence_across_subgroups);
          break;
       }
       FALLTHROUGH;
@@ -538,7 +544,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_inclusive_scan_clusters_ir3: {
       nir_op op = nir_intrinsic_reduction_op(instr);
       is_divergent = src_divergent(instr->src[0], state) ||
-                     state->options & nir_divergence_vertex;
+                     (state->options & nir_divergence_vertex) ||
+                     (state->options & nir_divergence_across_subgroups);
       if (op != nir_op_umin && op != nir_op_imin && op != nir_op_fmin &&
           op != nir_op_umax && op != nir_op_imax && op != nir_op_fmax &&
           op != nir_op_iand && op != nir_op_ior)
@@ -550,7 +557,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
       /* This reduces the last invocations in all 8-wide clusters. It should
        * behave the same as reduce with cluster_size == subgroup_size.
        */
-      is_divergent = state->options & nir_divergence_vertex;
+      is_divergent = (state->options & nir_divergence_vertex) ||
+                     (state->options & nir_divergence_across_subgroups);
       break;
 
    case nir_intrinsic_load_ubo:
@@ -749,9 +757,13 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
       /* Not having the non_uniform flag with divergent sources is undefined
        * behavior. The Intel driver defines it pick the lowest numbered live
        * SIMD lane (via emit_uniformize).
+       *
+       * When gathering the divergence across subgroups, we need to propagate
+       * the divergence from the sources.
        */
       if ((nir_intrinsic_resource_access_intel(instr) &
-           nir_resource_intel_non_uniform) != 0) {
+           nir_resource_intel_non_uniform) != 0 ||
+          (state->options & nir_divergence_across_subgroups)) {
          unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
          for (unsigned i = 0; i < num_srcs; i++) {
             if (src_divergent(instr->src[i], state)) {

@@ -77,12 +77,19 @@ typedef enum {
    nir_divergence_uniform_load_tears = (1 << 7),
    /* If used, this allows phis for divergent merges with undef and a uniform source to be considered uniform */
    nir_divergence_ignore_undef_if_phi_srcs = (1 << 8),
    /* Whether to compute vertex divergence (meaning between vertices
     * of the same primitive) instead of subgroup invocation divergence
     * (between invocations of the same subgroup). For example, patch input
     * loads are always convergent, while subgroup intrinsics are divergent.
     */
    nir_divergence_vertex = (1 << 11),
+   /* Whether to compute divergence of subgroup operations as if multiple
+    * subgroups ran in lock-step (for example subgroup operations normally
+    * convergent are divergent).
+    */
+   nir_divergence_across_subgroups = (1 << 12),
 } nir_divergence_options;
 
 /** An instruction filtering callback