mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-15 12:08:14 +02:00
kk: Expand workaround 3 to cover general use of ballot/vote ops
simd_ballot/quad_any/quad_all (and probably simd_any/simd_all) appear to generally be broken within conditional blocks, not just with simd_is_first. Reviewed-by: Aitor Camacho <aitor@lunarg.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41186>
This commit is contained in:
parent
5b34d1ff34
commit
0bc87e47e2
8 changed files with 89 additions and 40 deletions
|
|
@ -207,11 +207,20 @@ KK_WORKAROUND_3
|
|||
| macOS version: 15.4.x
|
||||
| Metal ticket: FB20113490 (@aitor)
|
||||
| Metal ticket status: Waiting resolution
|
||||
| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``
|
||||
| CTS test failure: ``dEQP-VK.subgroups.ballot_other.*.subgroupballotfindlsb``, ``dEQP-VK.subgroups.arithmetic.graphics.*``, ``dEQP-VK.subgroups.shader_quad_control.divergent_condition``
|
||||
| Comments:
|
||||
|
||||
``simd_is_first`` does not seem to behave as documented in the MSL
|
||||
specification. The following code snippet misbehaves:
|
||||
``simd_ballot`` within a conditional block does not seem to behave as
|
||||
documented in the MSL specification. For example, the following code blocks
|
||||
misbehave:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
bool execute = (gl_SubGroupInvocation & 1u) != 0u;
|
||||
if (execute)
|
||||
temp = simd_ballot(true); /* <- This may return all active threads... */
|
||||
else
|
||||
temp = 2u;
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
|
|
@ -220,17 +229,33 @@ specification. The following code snippet misbehaves:
|
|||
else
|
||||
temp = simd_ballot(true); /* <- This will return all active threads... */
|
||||
|
||||
The way to fix this is by changing the conditional to:
|
||||
This appears to also apply to ``quad_any`` and ``quad_all``, and likely the
|
||||
``simd`` equivalents as well.
|
||||
|
||||
The way to fix this is to use ``simd_or`` instead:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
if (simd_is_first() && (ulong)simd_ballot(true))
|
||||
temp = 3u;
|
||||
bool execute = (gl_SubGroupInvocation & 1u) != 0u;
|
||||
if (execute)
|
||||
temp = simd_or(1 << gl_SubGroupInvocation);
|
||||
else
|
||||
temp = (ulong)simd_ballot(true);
|
||||
temp = 2u;
|
||||
|
||||
Alternatively, the conditional can be changed to include ``simd_ballot(true)``:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
bool execute = (gl_SubGroupInvocation & 1u) != 0u;
|
||||
if (execute && (ulong)simd_ballot(true))
|
||||
temp = simd_ballot(true);
|
||||
else
|
||||
temp = 2u;
|
||||
|
||||
|
||||
| Log:
|
||||
| 2025-09-09: Workaround implemented and reported to Apple
|
||||
| 2026-04-28: Workaround expanded to cover all ballot/vote ops.
|
||||
|
||||
KK_WORKAROUND_2
|
||||
---------------
|
||||
|
|
|
|||
|
|
@ -282,6 +282,7 @@ main(int argc, char **argv)
|
|||
nir_address_format_62bit_generic);
|
||||
|
||||
msl_preprocess_nir(s);
|
||||
msl_preprocess_nir_workarounds(nir, 0);
|
||||
msl_optimize_nir(nir);
|
||||
|
||||
NIR_PASS(_, s, nir_opt_deref);
|
||||
|
|
|
|||
|
|
@ -50,30 +50,6 @@ lower_bool_ops(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused)
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intr->instr);
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_vote_any: {
|
||||
/* We don't have vote instructions, but we have efficient ballots */
|
||||
nir_def *ballot = nir_ballot(b, 1, 32, intr->src[0].ssa);
|
||||
nir_def_rewrite_uses(&intr->def, nir_ine_imm(b, ballot, 0));
|
||||
return true;
|
||||
}
|
||||
|
||||
case nir_intrinsic_vote_all: {
|
||||
nir_def *ballot = nir_ballot(b, 1, 32, nir_inot(b, intr->src[0].ssa));
|
||||
nir_def_rewrite_uses(&intr->def, nir_ieq_imm(b, ballot, 0));
|
||||
return true;
|
||||
}
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
msl_nir_lower_subgroups(nir_shader *nir)
|
||||
{
|
||||
|
|
@ -86,13 +62,12 @@ msl_nir_lower_subgroups(nir_shader *nir)
|
|||
.lower_vote_feq = true,
|
||||
.lower_vote_bool_eq = true,
|
||||
.lower_inverse_ballot = true,
|
||||
/* Metal requires relative shuffle operations to have uniform delta */
|
||||
.lower_relative_shuffle = true,
|
||||
.lower_quad = true,
|
||||
/* Metal reduce operations do not support certain types or cluster size */
|
||||
.lower_reduce = true,
|
||||
};
|
||||
NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options);
|
||||
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower,
|
||||
nir_metadata_control_flow, NULL);
|
||||
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_bool_ops,
|
||||
nir_metadata_control_flow, NULL);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -177,6 +177,8 @@ update_instr_type(struct hash_table *types, nir_instr *instr, ti_type type)
|
|||
return true;
|
||||
case nir_intrinsic_read_first_invocation:
|
||||
case nir_intrinsic_read_invocation:
|
||||
case nir_intrinsic_quad_vote_all:
|
||||
case nir_intrinsic_quad_vote_any:
|
||||
case nir_intrinsic_quad_broadcast:
|
||||
case nir_intrinsic_quad_swap_horizontal:
|
||||
case nir_intrinsic_quad_swap_vertical:
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include "nir_to_msl.h"
|
||||
#include "msl_private.h"
|
||||
#include "nir.h"
|
||||
#include "nir_builder.h"
|
||||
|
||||
static const char *
|
||||
get_stage_string(mesa_shader_stage stage)
|
||||
|
|
@ -1456,12 +1457,7 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
|
|||
P(ctx, ");\n");
|
||||
break;
|
||||
case nir_intrinsic_elect:
|
||||
/* KK_WORKAROUND_3 */
|
||||
if (ctx->disabled_workarounds & BITFIELD64_BIT(3)) {
|
||||
P(ctx, "simd_is_first();\n");
|
||||
} else {
|
||||
P(ctx, "simd_is_first() && (ulong)simd_ballot(true);\n");
|
||||
}
|
||||
P(ctx, "simd_is_first();\n");
|
||||
break;
|
||||
case nir_intrinsic_read_first_invocation:
|
||||
P(ctx, "simd_broadcast_first(");
|
||||
|
|
@ -1514,6 +1510,16 @@ intrinsic_to_msl(struct nir_to_msl_ctx *ctx, nir_intrinsic_instr *instr)
|
|||
src_to_msl(ctx, &instr->src[0]);
|
||||
P(ctx, ");\n");
|
||||
break;
|
||||
case nir_intrinsic_quad_vote_all:
|
||||
P(ctx, "quad_all(");
|
||||
src_to_msl(ctx, &instr->src[0]);
|
||||
P(ctx, ");\n");
|
||||
break;
|
||||
case nir_intrinsic_quad_vote_any:
|
||||
P(ctx, "quad_any(");
|
||||
src_to_msl(ctx, &instr->src[0]);
|
||||
P(ctx, ");\n");
|
||||
break;
|
||||
case nir_intrinsic_quad_broadcast:
|
||||
P(ctx, "quad_broadcast(");
|
||||
src_to_msl(ctx, &instr->src[0]);
|
||||
|
|
@ -2057,6 +2063,39 @@ msl_optimize_nir(struct nir_shader *nir)
|
|||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_ballot(nir_builder *b, nir_intrinsic_instr *intrin, void *_unused)
|
||||
{
|
||||
if (intrin->intrinsic != nir_intrinsic_ballot)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
nir_def* invocation = nir_load_subgroup_invocation(b);
|
||||
nir_def* mask = nir_ishl(b, nir_b2i32(b, intrin->src[0].ssa), invocation);
|
||||
nir_def* reduce = nir_reduce(b, mask, .reduction_op = nir_op_ior);
|
||||
nir_def_rewrite_uses(&intrin->def, reduce);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void msl_preprocess_nir_workarounds(struct nir_shader *nir,
|
||||
uint64_t disabled_workarounds)
|
||||
{
|
||||
/* KK_WORKAROUND_3 */
|
||||
if (!(disabled_workarounds & BITFIELD64_BIT(3))) {
|
||||
const nir_lower_subgroups_options subgroups_options = {
|
||||
.subgroup_size = 32,
|
||||
.ballot_bit_size = 32,
|
||||
.ballot_components = 1,
|
||||
.lower_vote = true,
|
||||
.lower_quad_vote = true,
|
||||
};
|
||||
NIR_PASS(_, nir, nir_lower_subgroups, &subgroups_options);
|
||||
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_ballot,
|
||||
nir_metadata_control_flow, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Scalarize stores to CLIP_DIST* varyings */
|
||||
static bool
|
||||
scalarize_clip_distance_filter(const nir_intrinsic_instr *intrin,
|
||||
|
|
|
|||
|
|
@ -31,6 +31,11 @@ bool msl_optimize_nir(struct nir_shader *nir);
|
|||
/* Call this before all API-specific lowerings, it will */
|
||||
void msl_preprocess_nir(struct nir_shader *nir);
|
||||
|
||||
/* Call this before all API-specific lowerings. It will pre-process with
|
||||
* instruction workarounds based on the disabled workarounds bitmask. */
|
||||
void msl_preprocess_nir_workarounds(struct nir_shader *nir,
|
||||
uint64_t disabled_workarounds);
|
||||
|
||||
enum msl_tex_access_flag {
|
||||
MSL_ACCESS_SAMPLE = 0,
|
||||
MSL_ACCESS_READ,
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@ static void
|
|||
optimize(nir_shader *nir)
|
||||
{
|
||||
msl_preprocess_nir(nir);
|
||||
msl_preprocess_nir_workarounds(nir, 0);
|
||||
|
||||
nir_lower_compute_system_values_options csv_options = {
|
||||
.has_base_global_invocation_id = 0,
|
||||
|
|
|
|||
|
|
@ -1121,6 +1121,7 @@ kk_compile_shaders(struct vk_device *device, uint32_t shader_count,
|
|||
const struct vk_shader_compile_info *info = &infos[i];
|
||||
nir_shader *nir = info->nir;
|
||||
|
||||
msl_preprocess_nir_workarounds(nir, dev->disabled_workarounds);
|
||||
kk_lower_nir(dev, nir, info->robustness, info->set_layout_count,
|
||||
info->set_layouts, state);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue