kk: Expand workaround 3 to cover general use of ballot/vote ops

simd_ballot/quad_any/quad_all (and probably simd_any/simd_all) appear to generally be broken within conditional blocks, not just with simd_is_first. Reviewed-by: Aitor Camacho <aitor@lunarg.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41186>
2026-05-15 12:08:14 +02:00 · 2026-04-26 15:44:32 -07:00 · 2026-04-26 15:44:32 -07:00 · 0bc87e47e2
commit 0bc87e47e2
parent 5b34d1ff34
8 changed files with 89 additions and 40 deletions
--- a/docs/drivers/kosmickrisp/workarounds.rst
+++ b/docs/drivers/kosmickrisp/workarounds.rst
@ -207,11 +207,20 @@ KK_WORKAROUND_3
 | macOS version: 15.4.x
 | Metal ticket: FB20113490 (@aitor)
 | Metal ticket status: Waiting resolution
-| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``
+| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``, ``dEQP-VK.subgroups.arithmetic.graphics.*``, ``dEQP-VK.subgroups.shader_quad_control.divergent_condition``
 | Comments:

-``simd_is_first`` does not seem to behave as documented in the MSL
-specification. The following code snippet misbehaves:
+``simd_ballot`` within a conditional block does not seem to behave as
+documented in the MSL specification. For example, the following code blocks
+misbehave:
+
+.. code-block:: c
+
+   bool execute = (gl_SubGroupInvocation & 1u) != 0u;
+   if (execute)
+      temp = simd_ballot(true); /* <- This may return all active threads... */
+   else
+      temp = 2u;

 .. code-block:: c

@ -220,17 +229,33 @@ specification. The following code snippet misbehaves:
   else
      temp = simd_ballot(true); /* <- This will return all active threads... */

-The way to fix this is by changing the conditional to:
+This appears to also apply to ``quad_any`` and ``quad_all``, and likely the
+``simd`` equivalents as well.
+
+The way to fix this is to use ``simd_or`` instead:

 .. code-block:: c

-   if (simd_is_first() && (ulong)simd_ballot(true))
-      temp = 3u;
+   bool execute = (gl_SubGroupInvocation & 1u) != 0u;
+   if (execute)
+      temp = simd_or(1 << gl_SubGroupInvocation);
   else
-      temp = (ulong)simd_ballot(true);
+      temp = 2u;
+
+Alternatively, the conditional can be changed to include ``simd_ballot(true)``:
+
+.. code-block:: c
+
+   bool execute = (gl_SubGroupInvocation & 1u) != 0u;
+   if (execute && (ulong)simd_ballot(true))
+      temp = simd_ballot(true);
+   else
+      temp = 2u;
+

 | Log:
 | 2025-09-09: Workaround implemented and reported to Apple
+| 2026-04-28: Workaround updated to expand to all ballot/vote ops.

 KK_WORKAROUND_2
 ---------------
--- a/src/kosmickrisp/clc/kk_clc.c
+++ b/src/kosmickrisp/clc/kk_clc.c
@ -282,6 +282,7 @@ main(int argc, char **argv)
                  nir_address_format_62bit_generic);

         msl_preprocess_nir(s);
+         msl_preprocess_nir_workarounds(nir, 0);
         msl_optimize_nir(nir);

         NIR_PASS(_, s, nir_opt_deref);
--- a/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c
+++ b/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c
@ -50,30 +50,6 @@ lower_bool_ops(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused)
   return true;
 }

-static bool
-lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
-{
-   b->cursor = nir_before_instr(&intr->instr);
-
-   switch (intr->intrinsic) {
-   case nir_intrinsic_vote_any: {
-      /* We don't have vote instructions, but we have efficient ballots */
-      nir_def *ballot = nir_ballot(b, 1, 32, intr->src[0].ssa);
-      nir_def_rewrite_uses(&intr->def, nir_ine_imm(b, ballot, 0));
-      return true;
-   }
-
-   case nir_intrinsic_vote_all: {
-      nir_def *ballot = nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa));
-      nir_def_rewrite_uses(&intr->def, nir_ieq_imm(b, ballot, 0));
-      return true;
-   }
-
-   default:
-      return false;
-   }
-}
-
 void
 msl_nir_lower_subgroups(nir_shader *nir)
 {
@ -86,13 +62,12 @@ msl_nir_lower_subgroups(nir_shader *nir)
      .lower_vote_feq = true,
      .lower_vote_bool_eq = true,
      .lower_inverse_ballot = true,
+      /* Metal requires relative shuffle operations to have uniform delta */
      .lower_relative_shuffle = true,
-      .lower_quad = true,
+      /* Metal reduce operations do not support certain types or cluster size */
      .lower_reduce = true,
   };
   NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options);
-   NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower,
-            nir_metadata_control_flow, NULL);
   NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_bool_ops,
            nir_metadata_control_flow, NULL);
 }
--- a/src/kosmickrisp/compiler/msl_type_inference.c
+++ b/src/kosmickrisp/compiler/msl_type_inference.c
@ -177,6 +177,8 @@ update_instr_type(struct hash_table *types, nir_instr *instr, ti_type type)
         return true;
      case nir_intrinsic_read_first_invocation:
      case nir_intrinsic_read_invocation:
+      case nir_intrinsic_quad_vote_all:
+      case nir_intrinsic_quad_vote_any:
      case nir_intrinsic_quad_broadcast:
      case nir_intrinsic_quad_swap_horizontal:
      case nir_intrinsic_quad_swap_vertical:
--- a/src/kosmickrisp/compiler/nir_to_msl.c
+++ b/src/kosmickrisp/compiler/nir_to_msl.c
@ -7,6 +7,7 @@
 #include "nir_to_msl.h"
 #include "msl_private.h"
 #include "nir.h"
+#include "nir_builder.h"

 static const char *
 get_stage_string(mesa_shader_stage stage)
@ -1456,12 +1457,7 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
      P(ctx, ");\n");
      break;
   case nir_intrinsic_elect:
-      /* KK_WORKAROUND_3 */
-      if (ctx->disabled_workarounds & BITFIELD64_BIT(3)) {
-         P(ctx, "simd_is_first();\n");
-      } else {
-         P(ctx, "simd_is_first() && (ulong)simd_ballot(true);\n");
-      }
+      P(ctx, "simd_is_first();\n");
      break;
   case nir_intrinsic_read_first_invocation:
      P(ctx, "simd_broadcast_first(");
@ -1514,6 +1510,16 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
      src_to_msl(ctx, &instr->src[0]);
      P(ctx, ");\n");
      break;
+   case nir_intrinsic_quad_vote_all:
+      P(ctx, "quad_all(");
+      src_to_msl(ctx, &instr->src[0]);
+      P(ctx, ");\n");
+      break;
+   case nir_intrinsic_quad_vote_any:
+      P(ctx, "quad_any(");
+      src_to_msl(ctx, &instr->src[0]);
+      P(ctx, ");\n");
+      break;
   case nir_intrinsic_quad_broadcast:
      P(ctx, "quad_broadcast(");
      src_to_msl(ctx, &instr->src[0]);
@ -2057,6 +2063,39 @@ msl_optimize_nir(struct nir_shader *nir)
   return progress;
 }

+static bool
+lower_ballot(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused)
+{
+   if (intrin->intrinsic != nir_intrinsic_ballot)
+      return false;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+   nir_def* invocation = nir_load_subgroup_invocation(b);
+   nir_def* mask = nir_ishl(b, nir_b2i32(b, intrin->src[0].ssa), invocation);
+   nir_def* reduce = nir_reduce(b, mask, .reduction_op = nir_op_ior);
+   nir_def_rewrite_uses(&intrin->def, reduce);
+
+   return true;
+}
+
+void msl_preprocess_nir_workarounds(struct nir_shader *nir,
+                                    uint64_t disabled_workarounds)
+{
+   /* KK_WORKAROUND_3 */
+   if (!(disabled_workarounds & BITFIELD64_BIT(3))) {
+      const nir_lower_subgroups_options subgroups_options = {
+         .subgroup_size = 32,
+         .ballot_bit_size = 32,
+         .ballot_components = 1,
+         .lower_vote = true,
+         .lower_quad_vote = true,
+      };
+      NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options);
+      NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_ballot,
+               nir_metadata_control_flow, NULL);
+   }
+}
+
 /* Scalarize stores to CLIP_DIST* varyings */
 static bool
 scalarize_clip_distance_filter(const nir_intrinsic_instr *intrin,
--- a/src/kosmickrisp/compiler/nir_to_msl.h
+++ b/src/kosmickrisp/compiler/nir_to_msl.h
@ -31,6 +31,11 @@ bool msl_optimize_nir(struct nir_shader *nir);
 /* Call this before all API-speicific lowerings, it will */
 void msl_preprocess_nir(struct nir_shader *nir);

+/* Call this before all API-specific lowerings. It will pre-process with
+ * instruction workarounds based on the disabled workarounds bitmask. */
+void msl_preprocess_nir_workarounds(struct nir_shader *nir,
+                                    uint64_t disabled_workarounds);
+
 enum msl_tex_access_flag {
   MSL_ACCESS_SAMPLE = 0,
   MSL_ACCESS_READ,
--- a/src/kosmickrisp/kosmicomp.c
+++ b/src/kosmickrisp/kosmicomp.c
@ -69,6 +69,7 @@ static void
 optimize(nir_shader *nir)
 {
   msl_preprocess_nir(nir);
+   msl_preprocess_nir_workarounds(nir, 0);

   nir_lower_compute_system_values_options csv_options = {
      .has_base_global_invocation_id = 0,
--- a/src/kosmickrisp/vulkan/kk_shader.c
+++ b/src/kosmickrisp/vulkan/kk_shader.c
@ -1121,6 +1121,7 @@ kk_compile_shaders(struct vk_device *device, uint32_t shader_count,
      const struct vk_shader_compile_info *info = &infos[i];
      nir_shader *nir = info->nir;

+      msl_preprocess_nir_workarounds(nir, dev->disabled_workarounds);
      kk_lower_nir(dev, nir, info->robustness, info->set_layout_count,
                   info->set_layouts, state);