From 0bc87e47e26266872ee733b310fd52a7cddedf4f Mon Sep 17 00:00:00 2001 From: squidbus Date: Sun, 26 Apr 2026 15:44:32 -0700 Subject: [PATCH] kk: Expand workaround 3 to cover general use of ballot/vote ops simd_ballot/quad_any/quad_all (and probably simd_any/simd_all) appear to generally be broken within conditional blocks, not just with simd_is_first. Reviewed-by: Aitor Camacho Part-of: --- docs/drivers/kosmickrisp/workarounds.rst | 39 +++++++++++--- src/kosmickrisp/clc/kk_clc.c | 1 + .../compiler/msl_nir_lower_subgroups.c | 29 +---------- src/kosmickrisp/compiler/msl_type_inference.c | 2 + src/kosmickrisp/compiler/nir_to_msl.c | 51 ++++++++++++++++--- src/kosmickrisp/compiler/nir_to_msl.h | 5 ++ src/kosmickrisp/kosmicomp.c | 1 + src/kosmickrisp/vulkan/kk_shader.c | 1 + 8 files changed, 89 insertions(+), 40 deletions(-) diff --git a/docs/drivers/kosmickrisp/workarounds.rst b/docs/drivers/kosmickrisp/workarounds.rst index cface470f4f..2eec9797dcb 100644 --- a/docs/drivers/kosmickrisp/workarounds.rst +++ b/docs/drivers/kosmickrisp/workarounds.rst @@ -207,11 +207,20 @@ KK_WORKAROUND_3 | macOS version: 15.4.x | Metal ticket: FB20113490 (@aitor) | Metal ticket status: Waiting resolution -| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb`` +| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``, ``dEQP-VK.subgroups.arithmetic.graphics.*``, ``dEQP-VK.subgroups.shader_quad_control.divergent_condition`` | Comments: -``simd_is_first`` does not seem to behave as documented in the MSL -specification. The following code snippet misbehaves: +``simd_ballot`` within a conditional block does not seem to behave as +documented in the MSL specification. For example, the following code blocks +misbehave: + +.. code-block:: c + + bool execute = (gl_SubGroupInvocation & 1u) != 0u; + if (execute) + temp = simd_ballot(true); /* <- This may return all active threads... */ + else + temp = 2u; .. 
code-block:: c @@ -220,17 +229,33 @@ specification. The following code snippet misbehaves: else temp = simd_ballot(true); /* <- This will return all active threads... */ -The way to fix this is by changing the conditional to: +This appears to also apply to ``quad_any`` and ``quad_all``, and likely the +``simd`` equivalents as well. + +The way to fix this is to use ``simd_or`` instead: .. code-block:: c - if (simd_is_first() && (ulong)simd_ballot(true)) - temp = 3u; + bool execute = (gl_SubGroupInvocation & 1u) != 0u; + if (execute) + temp = simd_or(1 << gl_SubGroupInvocation); else - temp = (ulong)simd_ballot(true); + temp = 2u; + +Alternatively, the conditional can be changed to include ``simd_ballot(true)``: + +.. code-block:: c + + bool execute = (gl_SubGroupInvocation & 1u) != 0u; + if (execute && (ulong)simd_ballot(true)) + temp = simd_ballot(true); + else + temp = 2u; + | Log: | 2025-09-09: Workaround implemented and reported to Apple +| 2026-04-28: Workaround expanded to cover all ballot/vote ops. 
KK_WORKAROUND_2 --------------- diff --git a/src/kosmickrisp/clc/kk_clc.c b/src/kosmickrisp/clc/kk_clc.c index b2ad7ca1dcd..28b45a14d52 100644 --- a/src/kosmickrisp/clc/kk_clc.c +++ b/src/kosmickrisp/clc/kk_clc.c @@ -282,6 +282,7 @@ main(int argc, char **argv) nir_address_format_62bit_generic); msl_preprocess_nir(s); + msl_preprocess_nir_workarounds(nir, 0); msl_optimize_nir(nir); NIR_PASS(_, s, nir_opt_deref); diff --git a/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c b/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c index 30201145c4a..ed48cf65b52 100644 --- a/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c +++ b/src/kosmickrisp/compiler/msl_nir_lower_subgroups.c @@ -50,30 +50,6 @@ lower_bool_ops(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused) return true; } -static bool -lower(nir_builder *b, nir_intrinsic_instr *intr, void *data) -{ - b->cursor = nir_before_instr(&intr->instr); - - switch (intr->intrinsic) { - case nir_intrinsic_vote_any: { - /* We don't have vote instructions, but we have efficient ballots */ - nir_def *ballot = nir_ballot(b, 1, 32, intr->src[0].ssa); - nir_def_rewrite_uses(&intr->def, nir_ine_imm(b, ballot, 0)); - return true; - } - - case nir_intrinsic_vote_all: { - nir_def *ballot = nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa)); - nir_def_rewrite_uses(&intr->def, nir_ieq_imm(b, ballot, 0)); - return true; - } - - default: - return false; - } -} - void msl_nir_lower_subgroups(nir_shader *nir) { @@ -86,13 +62,12 @@ msl_nir_lower_subgroups(nir_shader *nir) .lower_vote_feq = true, .lower_vote_bool_eq = true, .lower_inverse_ballot = true, + /* Metal requires relative shuffle operations to have uniform delta */ .lower_relative_shuffle = true, - .lower_quad = true, + /* Metal reduce operations do not support certain types or cluster size */ .lower_reduce = true, }; NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options); - NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower, - nir_metadata_control_flow, NULL); 
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_bool_ops, nir_metadata_control_flow, NULL); } diff --git a/src/kosmickrisp/compiler/msl_type_inference.c b/src/kosmickrisp/compiler/msl_type_inference.c index 4280a02ff61..30e3b472785 100644 --- a/src/kosmickrisp/compiler/msl_type_inference.c +++ b/src/kosmickrisp/compiler/msl_type_inference.c @@ -177,6 +177,8 @@ update_instr_type(struct hash_table *types, nir_instr *instr, ti_type type) return true; case nir_intrinsic_read_first_invocation: case nir_intrinsic_read_invocation: + case nir_intrinsic_quad_vote_all: + case nir_intrinsic_quad_vote_any: case nir_intrinsic_quad_broadcast: case nir_intrinsic_quad_swap_horizontal: case nir_intrinsic_quad_swap_vertical: diff --git a/src/kosmickrisp/compiler/nir_to_msl.c b/src/kosmickrisp/compiler/nir_to_msl.c index ca9a40c02ee..968ebdc1122 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.c +++ b/src/kosmickrisp/compiler/nir_to_msl.c @@ -7,6 +7,7 @@ #include "nir_to_msl.h" #include "msl_private.h" #include "nir.h" +#include "nir_builder.h" static const char * get_stage_string(mesa_shader_stage stage) @@ -1456,12 +1457,7 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) P(ctx, ");\n"); break; case nir_intrinsic_elect: - /* KK_WORKAROUND_3 */ - if (ctx->disabled_workarounds & BITFIELD64_BIT(3)) { - P(ctx, "simd_is_first();\n"); - } else { - P(ctx, "simd_is_first() && (ulong)simd_ballot(true);\n"); - } + P(ctx, "simd_is_first();\n"); break; case nir_intrinsic_read_first_invocation: P(ctx, "simd_broadcast_first("); @@ -1514,6 +1510,16 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr) src_to_msl(ctx, &instr->src[0]); P(ctx, ");\n"); break; + case nir_intrinsic_quad_vote_all: + P(ctx, "quad_all("); + src_to_msl(ctx, &instr->src[0]); + P(ctx, ");\n"); + break; + case nir_intrinsic_quad_vote_any: + P(ctx, "quad_any("); + src_to_msl(ctx, &instr->src[0]); + P(ctx, ");\n"); + break; case nir_intrinsic_quad_broadcast: P(ctx, 
"quad_broadcast("); src_to_msl(ctx, &instr->src[0]); @@ -2057,6 +2063,39 @@ msl_optimize_nir(struct nir_shader *nir) return progress; } +static bool +lower_ballot(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused) +{ + if (intrin->intrinsic != nir_intrinsic_ballot) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *invocation = nir_load_subgroup_invocation(b); + nir_def *mask = nir_ishl(b, nir_b2i32(b, intrin->src[0].ssa), invocation); + nir_def *reduce = nir_reduce(b, mask, .reduction_op = nir_op_ior); + nir_def_rewrite_uses(&intrin->def, reduce); + + return true; +} + +void msl_preprocess_nir_workarounds(struct nir_shader *nir, + uint64_t disabled_workarounds) +{ + /* KK_WORKAROUND_3 */ + if (!(disabled_workarounds & BITFIELD64_BIT(3))) { + const nir_lower_subgroups_options subgroups_options = { + .subgroup_size = 32, + .ballot_bit_size = 32, + .ballot_components = 1, + .lower_vote = true, + .lower_quad_vote = true, + }; + NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_ballot, + nir_metadata_control_flow, NULL); + } +} + /* Scalarize stores to CLIP_DIST* varyings */ static bool scalarize_clip_distance_filter(const nir_intrinsic_instr *intrin, diff --git a/src/kosmickrisp/compiler/nir_to_msl.h b/src/kosmickrisp/compiler/nir_to_msl.h index b5542de5ffd..54f0f5d52f5 100644 --- a/src/kosmickrisp/compiler/nir_to_msl.h +++ b/src/kosmickrisp/compiler/nir_to_msl.h @@ -31,6 +31,11 @@ bool msl_optimize_nir(struct nir_shader *nir); /* Call this before all API-speicific lowerings, it will */ void msl_preprocess_nir(struct nir_shader *nir); +/* Call this before all API-specific lowerings. It will pre-process with + * instruction workarounds based on the disabled workarounds bitmask. 
*/ +void msl_preprocess_nir_workarounds(struct nir_shader *nir, + uint64_t disabled_workarounds); + enum msl_tex_access_flag { MSL_ACCESS_SAMPLE = 0, MSL_ACCESS_READ, diff --git a/src/kosmickrisp/kosmicomp.c b/src/kosmickrisp/kosmicomp.c index 41dceb33048..adccae1f3c0 100644 --- a/src/kosmickrisp/kosmicomp.c +++ b/src/kosmickrisp/kosmicomp.c @@ -69,6 +69,7 @@ static void optimize(nir_shader *nir) { msl_preprocess_nir(nir); + msl_preprocess_nir_workarounds(nir, 0); nir_lower_compute_system_values_options csv_options = { .has_base_global_invocation_id = 0, diff --git a/src/kosmickrisp/vulkan/kk_shader.c b/src/kosmickrisp/vulkan/kk_shader.c index 7882edf1662..19a3728afe1 100644 --- a/src/kosmickrisp/vulkan/kk_shader.c +++ b/src/kosmickrisp/vulkan/kk_shader.c @@ -1121,6 +1121,7 @@ kk_compile_shaders(struct vk_device *device, uint32_t shader_count, const struct vk_shader_compile_info *info = &infos[i]; nir_shader *nir = info->nir; + msl_preprocess_nir_workarounds(nir, dev->disabled_workarounds); kk_lower_nir(dev, nir, info->robustness, info->set_layout_count, info->set_layouts, state);