From cc7934a89bd15099f7fd30fab8c8c27a9d9fc1ac Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Thu, 1 Feb 2024 10:30:46 +0100
Subject: [PATCH] broadcom/compiler: fix lane selection for subgroups in
 fragment shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems the hardware behavior for this follows the spec and we are
supposed to treat entire quads as active. In particular, there
are some derivative tests with dynamic control flow that use
subgroup ballot and require this.

However, we still need to exclude terminated lanes (OpTerminate). For
that, we keep track of the sample mask at the start of a fragment
shader and compare it with the current sample mask.

Fixes: ('broadcom/compiler: support subgroup reduction operations from fragment shaders')
Fixes: dEQP-VK.glsl.derivate.dynamic_loop.*
Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27409>
---
 src/broadcom/ci/broadcom-rpi5-fails.txt | 56 -------------------------
 src/broadcom/compiler/nir_to_vir.c      | 34 ++++++++-------
 src/broadcom/compiler/v3d_compiler.h    |  5 +++
 3 files changed, 24 insertions(+), 71 deletions(-)

diff --git a/src/broadcom/ci/broadcom-rpi5-fails.txt b/src/broadcom/ci/broadcom-rpi5-fails.txt
index 648dfd689a3..cf2598f6688 100644
--- a/src/broadcom/ci/broadcom-rpi5-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi5-fails.txt
@@ -15,59 +15,3 @@ dEQP-VK.wsi.wayland.swapchain.simulate_oom.image_usage,Crash
 dEQP-VK.wsi.wayland.swapchain.simulate_oom.min_image_count,Crash
 dEQP-VK.wsi.wayland.swapchain.simulate_oom.pre_transform,Crash
 dEQP-VK.wsi.wayland.swapchain.simulate_oom.present_mode,Crash
-
-# https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27409
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdx.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdxcoarse.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdxfine.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdy.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdycoarse.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.dfdyfine.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.fwidth.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.fwidthcoarse.dynamic_loop.vec4_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.float_highp,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.float_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec2_highp,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec2_mediump,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec4_highp,Fail
-dEQP-VK.glsl.derivate.fwidthfine.dynamic_loop.vec4_mediump,Fail
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index e6ed160523a..cee1cd612a7 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -3282,25 +3282,28 @@ emit_load_local_invocation_index(struct v3d_compile *c)
 
 /* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
  * fragment shaders a lane is considered active if any sample flags are set
- * for *any* lane in the same quad. This is not what we want. To fix this we
- * can emit MSF to get the active lanes and produce a condition that we
- * can then use with these operations to limit execution only to lanes that
- * are really active in each quad. Further, we also need to disable lanes
- * that may be disabled because of non-uniform control flow.
+ * for *any* lane in the same quad, however, we still need to ensure that
+ * terminated lanes (OpTerminate) are not included. Further, we also need to
+ * disable lanes that may be disabled because of non-uniform control
+ * flow.
  */
 static enum v3d_qpu_cond
-setup_subgroup_reduction_condition(struct v3d_compile *c)
+setup_subgroup_control_flow_condition(struct v3d_compile *c)
 {
         assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
                c->s->info.stage == MESA_SHADER_COMPUTE);
 
         enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
 
-        /* Produce condition for 'lane is active' from current sample flags.
-         * Only required for fragment shaders.
+        /* We need to make sure that terminated lanes in fragment shaders are
+         * not included. We can identify these lanes by comparing the initial
+         * sample mask with the current. This fixes:
+         * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
          */
-        if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-                vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), vir_MSF(c)),
+        if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
+                vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
+                                           vir_NOT(c, vir_XOR(c, c->start_msf,
+                                                              vir_MSF(c)))),
                            V3D_QPU_PF_PUSHZ);
                 cond = V3D_QPU_COND_IFNA;
         }
@@ -3883,7 +3886,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_ballot: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                 struct qreg res = vir_get_temp(c);
                 vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
@@ -3902,7 +3905,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_read_first_invocation: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                 struct qreg res = vir_get_temp(c);
                 vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
@@ -3922,7 +3925,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_vote_ieq: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                 struct qreg res = vir_get_temp(c);
                 vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
                              vir_ALLEQ_dest(c, res, value) :
@@ -3940,7 +3943,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_vote_all: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                 struct qreg res = vir_get_temp(c);
                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
 
@@ -3964,7 +3967,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_vote_any: {
                 assert(c->devinfo->ver >= 71);
                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
-                enum v3d_qpu_cond cond = setup_subgroup_reduction_condition(c);
+                enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
                 struct qreg res = vir_get_temp(c);
                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
 
@@ -4544,6 +4547,7 @@ nir_to_vir(struct v3d_compile *c)
 {
         switch (c->s->info.stage) {
         case MESA_SHADER_FRAGMENT:
+                c->start_msf = vir_MSF(c);
                 if (c->devinfo->ver < 71)
                         c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
                 else
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index ef6cd13d6f3..12aaacdc14a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -793,6 +793,11 @@ struct v3d_compile {
         struct qreg cs_shared_offset;
         int local_invocation_index_bits;
 
+        /* Starting value of the sample mask in a fragment shader. We use
+         * this to identify lanes that have been terminated/discarded.
+         */
+        struct qreg start_msf;
+
         /* If the shader uses subgroup functionality */
         bool has_subgroups;