kk: Expand workaround 3 to cover general use of ballot/vote ops

simd_ballot/quad_any/quad_all (and probably simd_any/simd_all) appear to
generally be broken within conditional blocks, not just with simd_is_first.

Reviewed-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41186>
This commit is contained in:
squidbus 2026-04-26 15:44:32 -07:00 committed by Marge Bot
parent 5b34d1ff34
commit 0bc87e47e2
8 changed files with 89 additions and 40 deletions

View file

@ -207,11 +207,20 @@ KK_WORKAROUND_3
| macOS version: 15.4.x
| Metal ticket: FB20113490 (@aitor)
| Metal ticket status: Waiting resolution
| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``
| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``, ``dEQP-VK.subgroups.arithmetic.graphics.*``, ``dEQP-VK.subgroups.shader_quad_control.divergent_condition``
| Comments:
``simd_is_first`` does not seem to behave as documented in the MSL
specification. The following code snippet misbehaves:
``simd_ballot`` within a conditional block does not seem to behave as
documented in the MSL specification. For example, the following code blocks
misbehave:
.. code-block:: c
bool execute = (gl_SubGroupInvocation & 1u) != 0u;
if (execute)
temp = simd_ballot(true); /* <- This may return all active threads... */
else
temp = 2u;
.. code-block:: c
@ -220,17 +229,33 @@ specification. The following code snippet misbehaves:
else
temp = simd_ballot(true); /* <- This will return all active threads... */
The way to fix this is by changing the conditional to:
This appears to also apply to ``quad_any`` and ``quad_all``, and likely the
``simd`` equivalents as well.
The way to fix this is to use ``simd_or`` instead:
.. code-block:: c
if (simd_is_first() && (ulong)simd_ballot(true))
temp = 3u;
bool execute = (gl_SubGroupInvocation & 1u) != 0u;
if (execute)
temp = simd_or(1u << gl_SubGroupInvocation);
else
temp = (ulong)simd_ballot(true);
temp = 2u;
Alternatively, the conditional can be changed to include ``simd_ballot(true)``:
.. code-block:: c
bool execute = (gl_SubGroupInvocation & 1u) != 0u;
if (execute && (ulong)simd_ballot(true))
temp = simd_ballot(true);
else
temp = 2u;
| Log:
| 2025-09-09: Workaround implemented and reported to Apple
| 2026-04-28: Workaround expanded to cover all ballot/vote ops
KK_WORKAROUND_2
---------------

View file

@ -282,6 +282,7 @@ main(int argc, char **argv)
nir_address_format_62bit_generic);
msl_preprocess_nir(s);
msl_preprocess_nir_workarounds(nir, 0);
msl_optimize_nir(nir);
NIR_PASS(_, s, nir_opt_deref);

View file

@ -50,30 +50,6 @@ lower_bool_ops(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused)
return true;
}
/*
 * Intrinsic callback that expresses subgroup votes in terms of ballots.
 *
 *   vote_any(x) -> ballot(x)  != 0
 *   vote_all(x) -> ballot(!x) == 0
 *
 * Returns true when the intrinsic was a vote that got rewritten, false for
 * every other intrinsic. `data` is not used.
 */
static bool
lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   const bool is_any = intr->intrinsic == nir_intrinsic_vote_any;
   const bool is_all = intr->intrinsic == nir_intrinsic_vote_all;

   /* New instructions are emitted right before the intrinsic being visited. */
   b->cursor = nir_before_instr(&intr->instr);

   if (!is_any && !is_all)
      return false;

   nir_def *predicate = intr->src[0].ssa;
   nir_def *result;

   if (is_any) {
      /* We don't have vote instructions, but we have efficient ballots */
      nir_def *voters = nir_ballot(b, 1, 32, predicate);
      result = nir_ine_imm(b, voters, 0);
   } else {
      /* "All true" is the same as "nobody voted false". */
      nir_def *dissenters = nir_ballot(b, 1, 32, nir_inot(b, predicate));
      result = nir_ieq_imm(b, dissenters, 0);
   }

   nir_def_rewrite_uses(&intr->def, result);
   return true;
}
void
msl_nir_lower_subgroups(nir_shader *nir)
{
@ -86,13 +62,12 @@ msl_nir_lower_subgroups(nir_shader *nir)
.lower_vote_feq = true,
.lower_vote_bool_eq = true,
.lower_inverse_ballot = true,
/* Metal requires relative shuffle operations to have uniform delta */
.lower_relative_shuffle = true,
.lower_quad = true,
/* Metal reduce operations do not support certain types or cluster size */
.lower_reduce = true,
};
NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower,
nir_metadata_control_flow, NULL);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_bool_ops,
nir_metadata_control_flow, NULL);
}

View file

@ -177,6 +177,8 @@ update_instr_type(struct hash_table *types, nir_instr *instr, ti_type type)
return true;
case nir_intrinsic_read_first_invocation:
case nir_intrinsic_read_invocation:
case nir_intrinsic_quad_vote_all:
case nir_intrinsic_quad_vote_any:
case nir_intrinsic_quad_broadcast:
case nir_intrinsic_quad_swap_horizontal:
case nir_intrinsic_quad_swap_vertical:

View file

@ -7,6 +7,7 @@
#include "nir_to_msl.h"
#include "msl_private.h"
#include "nir.h"
#include "nir_builder.h"
static const char *
get_stage_string(mesa_shader_stage stage)
@ -1456,12 +1457,7 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
P(ctx, ");\n");
break;
case nir_intrinsic_elect:
/* KK_WORKAROUND_3 */
if (ctx->disabled_workarounds & BITFIELD64_BIT(3)) {
P(ctx, "simd_is_first();\n");
} else {
P(ctx, "simd_is_first() && (ulong)simd_ballot(true);\n");
}
P(ctx, "simd_is_first();\n");
break;
case nir_intrinsic_read_first_invocation:
P(ctx, "simd_broadcast_first(");
@ -1514,6 +1510,16 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;
case nir_intrinsic_quad_vote_all:
P(ctx, "quad_all(");
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;
case nir_intrinsic_quad_vote_any:
P(ctx, "quad_any(");
src_to_msl(ctx, &instr->src[0]);
P(ctx, ");\n");
break;
case nir_intrinsic_quad_broadcast:
P(ctx, "quad_broadcast(");
src_to_msl(ctx, &instr->src[0]);
@ -2057,6 +2063,39 @@ msl_optimize_nir(struct nir_shader *nir)
return progress;
}
/*
 * Intrinsic callback that replaces a ballot with an inclusive-OR reduction.
 *
 * Each invocation converts its predicate to a 32-bit integer, shifts it left
 * by its subgroup invocation index, and the per-invocation values are then
 * combined with an ior reduce. All uses of the original ballot are redirected
 * to the reduction result.
 *
 * Returns true when a ballot was rewritten, false for any other intrinsic.
 * The last parameter is not used.
 */
static bool
lower_ballot(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused)
{
   if (intrin->intrinsic != nir_intrinsic_ballot)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *lane = nir_load_subgroup_invocation(b);
   nir_def *vote = nir_b2i32(b, intrin->src[0].ssa);
   nir_def *lane_bit = nir_ishl(b, vote, lane);
   nir_def *combined = nir_reduce(b, lane_bit, .reduction_op = nir_op_ior);

   nir_def_rewrite_uses(&intrin->def, combined);
   return true;
}
/*
 * Applies NIR-level driver workarounds to `nir`.
 *
 * `disabled_workarounds` is a bitmask: when bit N is set, workaround N is
 * skipped. Only workaround 3 is handled here.
 */
void
msl_preprocess_nir_workarounds(struct nir_shader *nir,
                               uint64_t disabled_workarounds)
{
   /* KK_WORKAROUND_3 */
   const bool apply_workaround_3 =
      (disabled_workarounds & BITFIELD64_BIT(3)) == 0;

   if (apply_workaround_3) {
      /* Lower vote and quad-vote intrinsics first, then rewrite every ballot
       * through lower_ballot. */
      const nir_lower_subgroups_options subgroups_options = {
         .subgroup_size = 32,
         .ballot_bit_size = 32,
         .ballot_components = 1,
         .lower_vote = true,
         .lower_quad_vote = true,
      };

      NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options);
      NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_ballot,
               nir_metadata_control_flow, NULL);
   }
}
/* Scalarize stores to CLIP_DIST* varyings */
static bool
scalarize_clip_distance_filter(const nir_intrinsic_instr *intrin,

View file

@ -31,6 +31,11 @@ bool msl_optimize_nir(struct nir_shader *nir);
/* Call this before all API-specific lowerings. */
void msl_preprocess_nir(struct nir_shader *nir);
/* Call this before all API-specific lowerings. It will pre-process with
* instruction workarounds based on the disabled workarounds bitmask. */
void msl_preprocess_nir_workarounds(struct nir_shader *nir,
uint64_t disabled_workarounds);
enum msl_tex_access_flag {
MSL_ACCESS_SAMPLE = 0,
MSL_ACCESS_READ,

View file

@ -69,6 +69,7 @@ static void
optimize(nir_shader *nir)
{
msl_preprocess_nir(nir);
msl_preprocess_nir_workarounds(nir, 0);
nir_lower_compute_system_values_options csv_options = {
.has_base_global_invocation_id = 0,

View file

@ -1121,6 +1121,7 @@ kk_compile_shaders(struct vk_device *device, uint32_t shader_count,
const struct vk_shader_compile_info *info = &infos[i];
nir_shader *nir = info->nir;
msl_preprocess_nir_workarounds(nir, dev->disabled_workarounds);
kk_lower_nir(dev, nir, info->robustness, info->set_layout_count,
info->set_layouts, state);