broadcom/compiler: fix lane selection for subgroups in fragment shaders

It seems the hardware behavior for this is as per-spec and we are supposed to identify as active entire quads. Particularly, there are some derivative tests with dynamic control flow that use subgroup ballot and require this. However, we still need to exclude terminted lanes (OpTerminate). For that, we keep track of the sample mask at the start of a fagment shader start and compare it with the current sample mask. Fixes: ('broadcom/compiler: support subgroup reduction operations from fragment shaders') Fixes: dEQP-VK.glsl.derivate.dynamic_loop.* Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27409>
2026-05-07 15:48:36 +02:00 · 2024-02-01 10:30:46 +01:00 · 2024-02-01 10:30:46 +01:00 · cc7934a89b
commit cc7934a89b
parent 2143da6d5a
3 changed files with 24 additions and 71 deletions
--- a/src/broadcom/ci/broadcom-rpi5-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi5-fails.txt
@ -15,59 +15,3 @@ dEQP-VK.wsi.wayland.swapchain.simulate_oom.image_usage,Crash
 dEQP-VK.wsi.wayland.swapchain.simulate_oom.min_image_count,Crash
 dEQP-VK.wsi.wayland.swapchain.simulate_oom.pre_transform,Crash
 dEQP-VK.wsi.wayland.swapchain.simulate_oom.present_mode,Crash
-
-# https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27409
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec4_mediump,Fail
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@ -3282,25 +3282,28 @@ emit_load_local_invocation_index(struct v3d_compile *c)

 /* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
 * fragment shaders a lane is considered active if any sample flags are set
- * for *any* lane in the same quad. This is not what we want. To fix this we
- * can emit MSF to get the active lanes and produce a condition that we
- * can then use with these operations to limit execution only to lanes that
- * are really active in each quad. Further, we also need to disable lanes
- * that may be disabled because of non-uniform control flow.
+ * for *any* lane in the same quad, however, we still need to ensure that
+ * terminated lanes (OpTerminate) are not included. Further, we also need to
+ * disable lanes that may be disabled because of non-uniform control
+ * flow.
 */
 static enum v3d_qpu_cond
-setup_subgroup_reduction_condition(struct v3d_compile *c)
+setup_subgroup_control_flow_condition(struct v3d_compile *c)
 {
        assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
               c->s->info.stage == MESA_SHADER_COMPUTE);

        enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;

-        /* Produce condition for 'lane is active' from current sample flags.
-         * Only required for fragment shaders.
+        /* We need to make sure that terminated lanes in fragment shaders are
+         * not included. We can identify these lanes by comparing the inital
+         * sample mask with the current. This fixes:
+         * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
         */
-        if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), vir_MSF(c)),
+        if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
+                vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
+                                           vir_NOT(c, vir_XOR(c, c->start_msf,
+                                                              vir_MSF(c)))),
                           V3D_QPU_PF_PUSHZ);
                cond = V3D_QPU_COND_IFNA;
        }
@ -3883,7 +3886,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
        case nir_intrinsic_ballot: {
                assert(c->devinfo->ver >= 71);
                struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                struct qreg res = vir_get_temp(c);
                vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
                ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
@ -3902,7 +3905,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
        case nir_intrinsic_read_first_invocation: {
                assert(c->devinfo->ver >= 71);
                struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                struct qreg res = vir_get_temp(c);
                vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
                ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
@ -3922,7 +3925,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
        case nir_intrinsic_vote_ieq: {
                assert(c->devinfo->ver >= 71);
                struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                struct qreg res = vir_get_temp(c);
                vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
                             vir_ALLEQ_dest(c, res, value) :
@ -3940,7 +3943,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
        case nir_intrinsic_vote_all: {
                assert(c->devinfo->ver >= 71);
                struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                struct qreg res = vir_get_temp(c);
                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);

@ -3964,7 +3967,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
        case nir_intrinsic_vote_any: {
                assert(c->devinfo->ver >= 71);
                struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                struct qreg res = vir_get_temp(c);
                vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);

@ -4544,6 +4547,7 @@ nir_to_vir(struct v3d_compile *c)
 {
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
+                c->start_msf = vir_MSF(c);
                if (c->devinfo->ver < 71)
                        c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
                else
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@ -793,6 +793,11 @@ struct v3d_compile {
        struct qreg cs_shared_offset;
        int local_invocation_index_bits;

+        /* Starting value of the sample mask in a fragment shader. We use
+         * this to identify lanes that have been terminated/discarded.
+         */
+        struct qreg start_msf;
+
        /* If the shader uses subgroup functionality */
        bool has_subgroups;