From 0bc87e47e26266872ee733b310fd52a7cddedf4f Mon Sep 17 00:00:00 2001 From: squidbus Date: Sun, 26 Apr 2026 15:44:32 -0700 Subject: [PATCH] kk: Expand workaround 3 to cover general use of ballot/vote ops simd_ballot/quad_any/quad_all (and probably simd_any/simd_all) appear to generally be broken within conditional blocks, not just with simd_is_first. Reviewed-by: Aitor Camacho Part-of: --- docs/drivers/kosmickrisp/workarounds.rst | 39 +++++++++++--- src/kosmickrisp/clc/kk_clc.c | 1 + .../compiler/msl_nir_lower_subgroups.c | 29 +---------- src/kosmickrisp/compiler/msl_type_inference.c | 2 + src/kosmickrisp/compiler/nir_to_msl.c | 51 ++++++++++++++++--- src/kosmickrisp/compiler/nir_to_msl.h | 5 ++ src/kosmickrisp/kosmicomp.c | 1 + src/kosmickrisp/vulkan/kk_shader.c | 1 + 8 files changed, 89 insertions(+), 40 deletions(-) diff --git a/docs/drivers/kosmickrisp/workarounds.rst b/docs/drivers/kosmickrisp/workarounds.rst index cface470f4f..2eec9797dcb 100644 --- a/docs/drivers/kosmickrisp/workarounds.rst +++ b/docs/drivers/kosmickrisp/workarounds.rst @@ -207,11 +207,20 @@ KK_WORKAROUND_3 | macOS version: 15.4.x | Metal ticket: FB20113490 (@aitor) | Metal ticket status: Waiting resolution -| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb`` +| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``, ``dEQP-VK.subgroups.arithmetic.graphics.*``, ``dEQP-VK.subgroups.shader_quad_control.divergent_condition`` | Comments: -``simd_is_first`` does not seem to behave as documented in the MSL -specification. The following code snippet misbehaves: +``simd_ballot`` within a conditional block does not seem to behave as +documented in the MSL specification. For example, the following code blocks +misbehave: + +.. code-block:: c + + bool execute = (gl_SubGroupInvocation & 1u) != 0u; + if (execute) + temp = simd_ballot(true); /* <- This may return all active threads... */ + else + temp = 2u; .. 
code-block:: c @@ -220,17 +229,33 @@ specification. The following code snippet misbehaves: else temp = simd_ballot(true); /* <- This will return all active threads... */ -The way to fix this is by changing the conditional to: +This appears to also apply to ``quad_any`` and ``quad_all``, and likely the +``simd`` equivalents as well. + +The way to fix this is to use ``simd_or`` instead: .. code-block:: c - if (simd_is_first() && (ulong)simd_ballot(true)) - temp = 3u; + bool execute = (gl_SubGroupInvocation & 1u) != 0u; + if (execute) + temp = simd_or(1 << gl_SubGroupInvocation); else - temp = (ulong)simd_ballot(true); + temp = 2u; + +Alternatively, the conditional can be changed to include ``simd_ballot(true)``: + +.. code-block:: c + + bool execute = (gl_SubGroupInvocation & 1u) != 0u; + if (execute && (ulong)simd_ballot(true)) + temp = simd_ballot(true); + else + temp = 2u; + | Log: | 2025-09-09: Workaround implemented and reported to Apple +| 2026-04-28: Workaround expanded to cover all ballot/vote ops. 
KK_WORKAROUND_2 --------------- diff --git a/src/kosmickrisp/clc/kk_clc.c b/src/kosmickrisp/clc/kk_clc.c index b2ad7ca1dcd..28b45a14d52 100644 --- a/src/kosmickrisp/clc/kk_clc.c +++ b/src/kosmickrisp/clc/kk_clc.c @@ -282,6 +282,7 @@ main(int argc, char **argv) nir_address_format_62bit_generic); msl_preprocess_nir(s); + msl_preprocess_nir_workarounds(nir, 0); msl_optimize_nir(nir); NIR_PASS(_, s, nir_opt_deref); diff --git a/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c b/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c index 30201145c4a..ed48cf65b52 100644 --- a/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c +++ b/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c @@ -50,30 +50,6 @@ lower_bool_ops(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused) return true; } -static bool -lower(nir_builder *b, nir_intrinsic_instr *intr, void *data) -{ - b->cursor = nir_before_instr(&intr->instr); - - switch (intr->intrinsic) { - case nir_intrinsic_vote_any: { - /* We don't have vote instructions, but we have efficient ballots */ - nir_def *ballot = nir_ballot(b, 1, 32, intr->src[0].ssa); - nir_def_rewrite_uses(&intr->def, nir_ine_imm(b, ballot, 0)); - return true; - } - - case nir_intrinsic_vote_all: { - nir_def *ballot = nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa)); - nir_def_rewrite_uses(&intr->def, nir_ieq_imm(b, ballot, 0)); - return true; - } - - default: - return false; - } -} - void msl_nir_lower_subgroups(nir_shader *nir) { @@ -86,13 +62,12 @@ msl_nir_lower_subgroups(nir_shader *nir) .lower_vote_feq = true, .lower_vote_bool_eq = true, .lower_inverse_ballot = true, + /* Metal requires relative shuffle operations to have uniform delta */ .lower_relative_shuffle = true, - .lower_quad = true, + /* Metal reduce operations do not support certain types or cluster size */ .lower_reduce = true, }; NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options); - NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower, - nir_metadata_control_flow, NULL); 
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_bool_ops, nir_metadata_control_flow, NULL); } diff --git a/src/kosmickrisp/compiler/msl_type_inference.c b/src/kosmickrisp/compiler/msl_type_inference.c index 4280a02ff61..30e3b472785 100644 --- a/src/kosmickrisp/compiler/msl_type_inference.c +++ b/src/kosmickrisp/compiler/msl_type_inference.c @@ -177,6 +177,8 @@ update_instr_type(struct hash_table *types, nir_instr *instr, ti_type type) return true; case nir_intrinsic_read_first_invocation: case nir_intrinsic_read_invocation: + case nir_intrinsic_quad_vote_all: + case nir_intrinsic_quad_vote_any: case nir_intrinsic_quad_broadcast: case nir_intrinsic_quad_swap_horizontal: case nir_intrinsic_quad_swap_vertical: diff --git a/src/kosmickrisp/compiler/nir_to_msl.c b/src/kosmickrisp/compiler/nir_to_msl.c index ca9a40c02ee..968ebdc1122 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.c +++ b/src/kosmickrisp/compiler/nir_to_msl.c @@ -7,6 +7,7 @@ #include "nir_to_msl.h" #include "msl_private.h" #include "nir.h" +#include "nir_builder.h" static const char * get_stage_string(mesa_shader_stage stage) @@ -1456,12 +1457,7 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) P(ctx, ");\n"); break; case nir_intrinsic_elect: - /* KK_WORKAROUND_3 */ - if (ctx->disabled_workarounds & BITFIELD64_BIT(3)) { - P(ctx, "simd_is_first();\n"); - } else { - P(ctx, "simd_is_first() && (ulong)simd_ballot(true);\n"); - } + P(ctx, "simd_is_first();\n"); break; case nir_intrinsic_read_first_invocation: P(ctx, "simd_broadcast_first("); @@ -1514,6 +1510,16 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) src_to_msl(ctx, &instr->src[0]); P(ctx, ");\n"); break; + case nir_intrinsic_quad_vote_all: + P(ctx, "quad_all("); + src_to_msl(ctx, &instr->src[0]); + P(ctx, ");\n"); + break; + case nir_intrinsic_quad_vote_any: + P(ctx, "quad_any("); + src_to_msl(ctx, &instr->src[0]); + P(ctx, ");\n"); + break; case nir_intrinsic_quad_broadcast: P(ctx, 
"quad_broadcast("); src_to_msl(ctx, &instr->src[0]); @@ -2057,6 +2063,39 @@ msl_optimize_nir(struct nir_shader *nir) return progress; } +static bool +lower_ballot(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused) +{ + if (intrin->intrinsic != nir_intrinsic_ballot) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *invocation = nir_load_subgroup_invocation(b); + nir_def *mask = nir_ishl(b, nir_b2i32(b, intrin->src[0].ssa), invocation); + nir_def *reduce = nir_reduce(b, mask, .reduction_op = nir_op_ior); + nir_def_rewrite_uses(&intrin->def, reduce); + + return true; +} + +void msl_preprocess_nir_workarounds(struct nir_shader *nir, + uint64_t disabled_workarounds) +{ + /* KK_WORKAROUND_3 */ + if (!(disabled_workarounds & BITFIELD64_BIT(3))) { + const nir_lower_subgroups_options subgroups_options = { + .subgroup_size = 32, + .ballot_bit_size = 32, + .ballot_components = 1, + .lower_vote = true, + .lower_quad_vote = true, + }; + NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_ballot, + nir_metadata_control_flow, NULL); + } +} + /* Scalarize stores to CLIP_DIST* varyings */ static bool scalarize_clip_distance_filter(const nir_intrinsic_instr *intrin, diff --git a/src/kosmickrisp/compiler/nir_to_msl.h b/src/kosmickrisp/compiler/nir_to_msl.h index b5542de5ffd..54f0f5d52f5 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.h +++ b/src/kosmickrisp/compiler/nir_to_msl.h @@ -31,6 +31,11 @@ bool msl_optimize_nir(struct nir_shader *nir); /* Call this before all API-speicific lowerings, it will */ void msl_preprocess_nir(struct nir_shader *nir); +/* Call this before all API-specific lowerings. It will pre-process with + * instruction workarounds based on the disabled workarounds bitmask. 
*/ +void msl_preprocess_nir_workarounds(struct nir_shader *nir, + uint64_t disabled_workarounds); + enum msl_tex_access_flag { MSL_ACCESS_SAMPLE = 0, MSL_ACCESS_READ, diff --git a/src/kosmickrisp/kosmicomp.c b/src/kosmickrisp/kosmicomp.c index 41dceb33048..adccae1f3c0 100644 --- a/src/kosmickrisp/kosmicomp.c +++ b/src/kosmickrisp/kosmicomp.c @@ -69,6 +69,7 @@ static void optimize(nir_shader *nir) { msl_preprocess_nir(nir); + msl_preprocess_nir_workarounds(nir, 0); nir_lower_compute_system_values_options csv_options = { .has_base_global_invocation_id = 0, diff --git a/src/kosmickrisp/vulkan/kk_shader.c b/src/kosmickrisp/vulkan/kk_shader.c index 7882edf1662..19a3728afe1 100644 --- a/src/kosmickrisp/vulkan/kk_shader.c +++ b/src/kosmickrisp/vulkan/kk_shader.c @@ -1121,6 +1121,7 @@ kk_compile_shaders(struct vk_device *device, uint32_t shader_count, const struct vk_shader_compile_info *info = &infos[i]; nir_shader *nir = info->nir; + msl_preprocess_nir_workarounds(nir, dev->disabled_workarounds); kk_lower_nir(dev, nir, info->robustness, info->set_layout_count, info->set_layouts, state);